From 8cb50c0aaf9b7cac9369bd35d7bd038b51f50a33 Mon Sep 17 00:00:00 2001
From: ooooo <3164076421@qq.com>
Date: Tue, 19 Aug 2025 20:50:42 +0800
Subject: [PATCH 1/2] [New Sample] Add Some Text Generation Computational
 Graphs

---
 .../baidu/ERNIE-4.5-0.3B-PT/graph_hash.txt    |    1 +
 .../baidu/ERNIE-4.5-0.3B-PT/graph_net.json    |    6 +
 .../baidu/ERNIE-4.5-0.3B-PT/input_meta.py     |    0
 .../input_tensor_constraints.py               |    0
 .../baidu/ERNIE-4.5-0.3B-PT/model.py          | 4359 +++++++++++
 .../baidu/ERNIE-4.5-0.3B-PT/weight_meta.py    | 1678 +++++
 .../deepseek-coder-1.3b-base/graph_hash.txt   |    1 +
 .../deepseek-coder-1.3b-base/graph_net.json   |    6 +
 .../deepseek-coder-1.3b-base/input_meta.py    |    0
 .../input_tensor_constraints.py               |    0
 .../deepseek-coder-1.3b-base/model.py         | 4555 +++++++++++
 .../deepseek-coder-1.3b-base/weight_meta.py   | 2218 ++++++
 .../Phi-3-mini-4k-instruct/graph_hash.txt     |    1 +
 .../Phi-3-mini-4k-instruct/graph_net.json     |    6 +
 .../Phi-3-mini-4k-instruct/input_meta.py      |    0
 .../input_tensor_constraints.py               |    0
 .../microsoft/Phi-3-mini-4k-instruct/model.py | 5817 +++++++++++++++
 .../Phi-3-mini-4k-instruct/weight_meta.py     | 2007 +++++
 .../Phi-3.5-mini-instruct/graph_hash.txt      |    1 +
 .../Phi-3.5-mini-instruct/graph_net.json      |    6 +
 .../Phi-3.5-mini-instruct/input_meta.py       |    0
 .../input_tensor_constraints.py               |    0
 .../microsoft/Phi-3.5-mini-instruct/model.py  | 5736 ++++++++++++++
 .../Phi-3.5-mini-instruct/weight_meta.py      | 1968 +++++
 .../Phi-4-mini-instruct/graph_hash.txt        |    1 +
 .../Phi-4-mini-instruct/graph_net.json        |    6 +
 .../Phi-4-mini-instruct/input_meta.py         |    0
 .../input_tensor_constraints.py               |    0
 .../microsoft/Phi-4-mini-instruct/model.py    | 6632 +++++++++++++++++
 .../Phi-4-mini-instruct/weight_meta.py        | 1968 +++++
 .../microsoft/phi-1/graph_hash.txt            |    1 +
 .../microsoft/phi-1/graph_net.json            |    6 +
 .../microsoft/phi-1/input_meta.py             |    0
 .../phi-1/input_tensor_constraints.py         |    0
 .../microsoft/phi-1/model.py                  | 4894 ++++++++++++
 .../microsoft/phi-1/weight_meta.py            | 3425 +++++++++
 .../microsoft/phi-1_5/graph_hash.txt          |    1 +
 .../microsoft/phi-1_5/graph_net.json          |    6 +
 .../microsoft/phi-1_5/input_meta.py           |    0
 .../phi-1_5/input_tensor_constraints.py       |    0
 .../microsoft/phi-1_5/model.py                | 4894 ++++++++++++
 .../microsoft/phi-1_5/weight_meta.py          | 3425 +++++++++
 .../microsoft/phi-2/graph_hash.txt            |    1 +
 .../microsoft/phi-2/graph_net.json            |    6 +
 .../microsoft/phi-2/input_meta.py             |    0
 .../phi-2/input_tensor_constraints.py         |    0
 .../microsoft/phi-2/model.py                  | 6500 ++++++++++++++++
 .../microsoft/phi-2/weight_meta.py            | 4545 +++++++++++
 .../openai-community/gpt2/graph_hash.txt      |    1 +
 .../openai-community/gpt2/graph_net.json      |    6 +
 .../openai-community/gpt2/input_meta.py       |    0
 .../gpt2/input_tensor_constraints.py          |    0
 .../openai-community/gpt2/model.py            | 2449 ++++++
 .../openai-community/gpt2/weight_meta.py      | 1498 ++++
 54 files changed, 68631 insertions(+)
 create mode 100644 samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/graph_net.json
 create mode 100644 samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/input_meta.py
 create mode 100644 samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/model.py
 create mode 100644 samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/weight_meta.py
 create mode 100644 samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/graph_net.json
 create mode 100644 samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/input_meta.py
 create mode 100644 samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/model.py
 create mode 100644 samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/weight_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/graph_net.json
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/input_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/model.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/weight_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/graph_net.json
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/input_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/model.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/weight_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/graph_net.json
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/input_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/model.py
 create mode 100644 samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/weight_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1/graph_net.json
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1/input_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1/model.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1/weight_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1_5/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1_5/graph_net.json
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1_5/input_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1_5/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1_5/model.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-1_5/weight_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-2/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/microsoft/phi-2/graph_net.json
 create mode 100644 samples/transformers-auto-model/microsoft/phi-2/input_meta.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-2/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-2/model.py
 create mode 100644 samples/transformers-auto-model/microsoft/phi-2/weight_meta.py
 create mode 100644 samples/transformers-auto-model/openai-community/gpt2/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/openai-community/gpt2/graph_net.json
 create mode 100644 samples/transformers-auto-model/openai-community/gpt2/input_meta.py
 create mode 100644 samples/transformers-auto-model/openai-community/gpt2/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/openai-community/gpt2/model.py
 create mode 100644 samples/transformers-auto-model/openai-community/gpt2/weight_meta.py

diff --git a/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/graph_hash.txt b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/graph_hash.txt
new file mode 100644
index 000000000..08bf2432d
--- /dev/null
+++ b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/graph_hash.txt
@@ -0,0 +1 @@
+1db37dd049b1b2c8acc5343ca5a91f4cbefc906e83afcab42da4c5797d24e4de
\ No newline at end of file
diff --git a/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/graph_net.json b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/graph_net.json
new file mode 100644
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "torch",
+    "num_devices_required": 1,
+    "num_nodes_required": 1,
+    "dynamic": false
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/input_meta.py b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/input_tensor_constraints.py b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/model.py b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/model.py
new file mode 100644
index 000000000..53a0261e4
--- /dev/null
+++ b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/model.py
@@ -0,0 +1,4359 @@
+import torch
+
+from torch import device
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_inputs_embeds_: torch.Tensor,
+        L_attention_mask_: torch.Tensor,
+        L_self_modules_rotary_emb_buffers_inv_freq_: torch.Tensor,
+        L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_norm_parameters_weight_: torch.nn.parameter.Parameter,
+    ):
+        l_inputs_embeds_ = L_inputs_embeds_
+        l_attention_mask_ = L_attention_mask_
+        l_self_modules_rotary_emb_buffers_inv_freq_ = (
+            L_self_modules_rotary_emb_buffers_inv_freq_
+        )
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_norm_parameters_weight_ = L_self_modules_norm_parameters_weight_
+        cache_position = torch.arange(0, 2, device=device(type="cuda", index=0))
+        position_ids = cache_position.unsqueeze(0)
+        attention_mask = l_attention_mask_.to(
+            device=device(type="cuda", index=0), dtype=torch.bool
+        )
+        l_attention_mask_ = None
+        mask_indices = torch.arange(2, device=device(type="cuda", index=0))
+        mask_indices += 0
+        mask_indices_1 = mask_indices
+        mask_indices = None
+        local_padding_mask = attention_mask[(slice(None, None, None), mask_indices_1)]
+        attention_mask = mask_indices_1 = None
+        kv_arange = torch.arange(2, device=device(type="cuda", index=0))
+        kv_arange += 0
+        kv_arange_1 = kv_arange
+        kv_arange = None
+        reshaped_cache_position = cache_position.view(-1, 1)
+        cache_position = None
+        causal_mask = kv_arange_1 <= reshaped_cache_position
+        kv_arange_1 = reshaped_cache_position = None
+        getitem_1 = causal_mask[
+            (None, None, slice(None, None, None), slice(None, None, None))
+        ]
+        causal_mask = None
+        causal_mask_1 = getitem_1.expand(1, -1, -1, -1)
+        getitem_1 = None
+        getitem_2 = local_padding_mask[
+            (slice(None, None, None), None, None, slice(None, None, None))
+        ]
+        local_padding_mask = None
+        causal_mask_2 = causal_mask_1 * getitem_2
+        causal_mask_1 = getitem_2 = None
+        _set_grad_enabled = torch._C._set_grad_enabled(False)
+        _set_grad_enabled = None
+        getitem_3 = l_self_modules_rotary_emb_buffers_inv_freq_[
+            (None, slice(None, None, None), None)
+        ]
+        l_self_modules_rotary_emb_buffers_inv_freq_ = None
+        float_1 = getitem_3.float()
+        getitem_3 = None
+        expand_1 = float_1.expand(1, -1, 1)
+        float_1 = None
+        inv_freq_expanded = expand_1.to(device(type="cuda", index=0))
+        expand_1 = None
+        getitem_4 = position_ids[
+            (slice(None, None, None), None, slice(None, None, None))
+        ]
+        position_ids = None
+        position_ids_expanded = getitem_4.float()
+        getitem_4 = None
+        float_3 = inv_freq_expanded.float()
+        inv_freq_expanded = None
+        float_4 = position_ids_expanded.float()
+        position_ids_expanded = None
+        matmul = float_3 @ float_4
+        float_3 = float_4 = None
+        freqs = matmul.transpose(1, 2)
+        matmul = None
+        emb = torch.cat((freqs, freqs), dim=-1)
+        freqs = None
+        cos = emb.cos()
+        cos_1 = cos * 1.0
+        cos = None
+        sin = emb.sin()
+        emb = None
+        sin_1 = sin * 1.0
+        sin = None
+        _set_grad_enabled_1 = torch._C._set_grad_enabled(True)
+        _set_grad_enabled_1 = None
+        _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module")
+        _log_api_usage_once = None
+        hidden_states = l_inputs_embeds_.to(torch.float32)
+        pow_1 = hidden_states.pow(2)
+        variance = pow_1.mean(-1, keepdim=True)
+        pow_1 = None
+        add = variance + 1e-05
+        variance = None
+        rsqrt = torch.rsqrt(add)
+        add = None
+        hidden_states_1 = hidden_states * rsqrt
+        hidden_states = rsqrt = None
+        to_3 = hidden_states_1.to(torch.bfloat16)
+        hidden_states_1 = None
+        hidden_states_2 = (
+            l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_
+            * to_3
+        )
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = (
+            to_3
+        ) = None
+        linear = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_1 = linear.view((1, 2, -1, 128))
+        linear = None
+        query_states = view_1.transpose(1, 2)
+        view_1 = None
+        linear_1 = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_2 = linear_1.view((1, 2, -1, 128))
+        linear_1 = None
+        key_states = view_2.transpose(1, 2)
+        view_2 = None
+        linear_2 = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = (
+            None
+        )
+        view_3 = linear_2.view((1, 2, -1, 128))
+        linear_2 = None
+        value_states = view_3.transpose(1, 2)
+        view_3 = None
+        cos_2 = cos_1.unsqueeze(1)
+        sin_2 = sin_1.unsqueeze(1)
+        getitem_5 = cos_2[(Ellipsis, slice(None, 64, None))]
+        cos_2 = None
+        cos_3 = getitem_5.repeat_interleave(2, dim=-1)
+        getitem_5 = None
+        getitem_6 = sin_2[(Ellipsis, slice(None, 64, None))]
+        sin_2 = None
+        sin_3 = getitem_6.repeat_interleave(2, dim=-1)
+        getitem_6 = None
+        float_5 = query_states.float()
+        mul_5 = float_5 * cos_3
+        float_5 = None
+        x1 = query_states[(Ellipsis, slice(0, None, 2))]
+        x2 = query_states[(Ellipsis, slice(1, None, 2))]
+        query_states = None
+        neg = -x2
+        x2 = None
+        stack = torch.stack((neg, x1), dim=-1)
+        neg = x1 = None
+        flatten = stack.flatten(-2)
+        stack = None
+        float_6 = flatten.float()
+        flatten = None
+        mul_6 = float_6 * sin_3
+        float_6 = None
+        q_embed = mul_5 + mul_6
+        mul_5 = mul_6 = None
+        float_7 = key_states.float()
+        mul_7 = float_7 * cos_3
+        float_7 = cos_3 = None
+        x1_1 = key_states[(Ellipsis, slice(0, None, 2))]
+        x2_1 = key_states[(Ellipsis, slice(1, None, 2))]
+        key_states = None
+        neg_1 = -x2_1
+        x2_1 = None
+        stack_1 = torch.stack((neg_1, x1_1), dim=-1)
+        neg_1 = x1_1 = None
+        flatten_1 = stack_1.flatten(-2)
+        stack_1 = None
+        float_8 = flatten_1.float()
+        flatten_1 = None
+        mul_8 = float_8 * sin_3
+        float_8 = sin_3 = None
+        k_embed = mul_7 + mul_8
+        mul_7 = mul_8 = None
+        query_states_1 = q_embed.to(torch.bfloat16)
+        q_embed = None
+        key_states_1 = k_embed.to(torch.bfloat16)
+        k_embed = None
+        getitem_11 = key_states_1[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        hidden_states_3 = getitem_11.expand(1, 2, 8, 2, 128)
+        getitem_11 = None
+        key = hidden_states_3.reshape(1, 16, 2, 128)
+        hidden_states_3 = None
+        getitem_12 = value_states[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        hidden_states_4 = getitem_12.expand(1, 2, 8, 2, 128)
+        getitem_12 = None
+        value = hidden_states_4.reshape(1, 16, 2, 128)
+        hidden_states_4 = None
+        attention_mask_1 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query = query_states_1.contiguous()
+        query_states_1 = None
+        key_1 = key.contiguous()
+        key = None
+        value_1 = value.contiguous()
+        value = None
+        attn_output = torch._C._nn.scaled_dot_product_attention(
+            query,
+            key_1,
+            value_1,
+            attn_mask=attention_mask_1,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query = key_1 = value_1 = attention_mask_1 = None
+        transpose_4 = attn_output.transpose(1, 2)
+        attn_output = None
+        attn_output_1 = transpose_4.contiguous()
+        transpose_4 = None
+        reshape_2 = attn_output_1.reshape(1, 2, -1)
+        attn_output_1 = None
+        attn_output_2 = reshape_2.contiguous()
+        reshape_2 = None
+        attn_output_3 = torch._C._nn.linear(
+            attn_output_2,
+            l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = (
+            None
+        )
+        hidden_states_5 = l_inputs_embeds_ + attn_output_3
+        l_inputs_embeds_ = attn_output_3 = None
+        hidden_states_6 = hidden_states_5.to(torch.float32)
+        pow_2 = hidden_states_6.pow(2)
+        variance_1 = pow_2.mean(-1, keepdim=True)
+        pow_2 = None
+        add_4 = variance_1 + 1e-05
+        variance_1 = None
+        rsqrt_1 = torch.rsqrt(add_4)
+        add_4 = None
+        hidden_states_7 = hidden_states_6 * rsqrt_1
+        hidden_states_6 = rsqrt_1 = None
+        to_7 = hidden_states_7.to(torch.bfloat16)
+        hidden_states_7 = None
+        hidden_states_8 = (
+            l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_
+            * to_7
+        )
+        l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = (
+            to_7
+        ) = None
+        linear_4 = torch._C._nn.linear(
+            hidden_states_8,
+            l_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu = torch.nn.functional.silu(linear_4, inplace=False)
+        linear_4 = None
+        linear_5 = torch._C._nn.linear(
+            hidden_states_8,
+            l_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_8 = l_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = (
+            None
+        )
+        mul_11 = silu * linear_5
+        silu = linear_5 = None
+        down_proj = torch._C._nn.linear(
+            mul_11,
+            l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_11 = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = (
+            None
+        )
+        hidden_states_9 = hidden_states_5 + down_proj
+        hidden_states_5 = down_proj = None
+        hidden_states_10 = hidden_states_9.to(torch.float32)
+        pow_3 = hidden_states_10.pow(2)
+        variance_2 = pow_3.mean(-1, keepdim=True)
+        pow_3 = None
+        add_6 = variance_2 + 1e-05
+        variance_2 = None
+        rsqrt_2 = torch.rsqrt(add_6)
+        add_6 = None
+        hidden_states_11 = hidden_states_10 * rsqrt_2
+        hidden_states_10 = rsqrt_2 = None
+        to_9 = hidden_states_11.to(torch.bfloat16)
+        hidden_states_11 = None
+        hidden_states_12 = (
+            l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_
+            * to_9
+        )
+        l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = (
+            to_9
+        ) = None
+        linear_7 = torch._C._nn.linear(
+            hidden_states_12,
+            l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_4 = linear_7.view((1, 2, -1, 128))
+        linear_7 = None
+        query_states_2 = view_4.transpose(1, 2)
+        view_4 = None
+        linear_8 = torch._C._nn.linear(
+            hidden_states_12,
+            l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_5 = linear_8.view((1, 2, -1, 128))
+        linear_8 = None
+        key_states_2 = view_5.transpose(1, 2)
+        view_5 = None
+        linear_9 = torch._C._nn.linear(
+            hidden_states_12,
+            l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_12 = l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = (
+            None
+        )
+        view_6 = linear_9.view((1, 2, -1, 128))
+        linear_9 = None
+        value_states_1 = view_6.transpose(1, 2)
+        view_6 = None
+        cos_4 = cos_1.unsqueeze(1)
+        sin_4 = sin_1.unsqueeze(1)
+        getitem_14 = cos_4[(Ellipsis, slice(None, 64, None))]
+        cos_4 = None
+        cos_5 =
getitem_14.repeat_interleave(2, dim=-1) + getitem_14 = None + getitem_15 = sin_4[(Ellipsis, slice(None, 64, None))] + sin_4 = None + sin_5 = getitem_15.repeat_interleave(2, dim=-1) + getitem_15 = None + float_9 = query_states_2.float() + mul_14 = float_9 * cos_5 + float_9 = None + x1_2 = query_states_2[(Ellipsis, slice(0, None, 2))] + x2_2 = query_states_2[(Ellipsis, slice(1, None, 2))] + query_states_2 = None + neg_2 = -x2_2 + x2_2 = None + stack_2 = torch.stack((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + flatten_2 = stack_2.flatten(-2) + stack_2 = None + float_10 = flatten_2.float() + flatten_2 = None + mul_15 = float_10 * sin_5 + float_10 = None + q_embed_1 = mul_14 + mul_15 + mul_14 = mul_15 = None + float_11 = key_states_2.float() + mul_16 = float_11 * cos_5 + float_11 = cos_5 = None + x1_3 = key_states_2[(Ellipsis, slice(0, None, 2))] + x2_3 = key_states_2[(Ellipsis, slice(1, None, 2))] + key_states_2 = None + neg_3 = -x2_3 + x2_3 = None + stack_3 = torch.stack((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + flatten_3 = stack_3.flatten(-2) + stack_3 = None + float_12 = flatten_3.float() + flatten_3 = None + mul_17 = float_12 * sin_5 + float_12 = sin_5 = None + k_embed_1 = mul_16 + mul_17 + mul_16 = mul_17 = None + query_states_3 = q_embed_1.to(torch.bfloat16) + q_embed_1 = None + key_states_3 = k_embed_1.to(torch.bfloat16) + k_embed_1 = None + getitem_20 = key_states_3[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_13 = getitem_20.expand(1, 2, 8, 2, 128) + getitem_20 = None + key_2 = hidden_states_13.reshape(1, 16, 2, 128) + hidden_states_13 = None + getitem_21 = value_states_1[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_14 = getitem_21.expand(1, 2, 8, 2, 128) + getitem_21 = None + value_2 = hidden_states_14.reshape(1, 16, 2, 128) + hidden_states_14 = None + attention_mask_2 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_1 = query_states_3.contiguous() + query_states_3 = None + key_3 = key_2.contiguous() + key_2 = None + value_3 = value_2.contiguous() + value_2 = None + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_3, + value_3, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_1 = key_3 = value_3 = attention_mask_2 = None + transpose_8 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_8.contiguous() + transpose_8 = None + reshape_5 = attn_output_5.reshape(1, 2, -1) + attn_output_5 = None + attn_output_6 = reshape_5.contiguous() + reshape_5 = None + attn_output_7 = torch._C._nn.linear( + attn_output_6, + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_6 = l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_15 = hidden_states_9 + attn_output_7 + hidden_states_9 = attn_output_7 = None + hidden_states_16 = hidden_states_15.to(torch.float32) + pow_4 = hidden_states_16.pow(2) + variance_3 = pow_4.mean(-1, keepdim=True) + pow_4 = None + add_10 = variance_3 + 1e-05 + variance_3 = None + rsqrt_3 = torch.rsqrt(add_10) + add_10 = None + hidden_states_17 = hidden_states_16 * rsqrt_3 + hidden_states_16 = rsqrt_3 = None + to_13 = hidden_states_17.to(torch.bfloat16) + 
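# --- Editor's annotation: a hedged sketch, not part of the captured graph ---
# The recurring to(float32) -> pow(2) -> mean(-1) -> rsqrt -> weight pattern
# above (and in every layer below) is an unrolled RMSNorm, computed in fp32
# and cast back to bfloat16, with eps = 1e-05 hard-coded in the trace.
# The helper name below is ours, for illustration only.
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-05) -> torch.Tensor:
    h = x.to(torch.float32)                     # normalize in fp32 for stability
    variance = h.pow(2).mean(-1, keepdim=True)  # mean of squares; no mean-centering
    h = h * torch.rsqrt(variance + eps)         # x / sqrt(E[x^2] + eps)
    return weight * h.to(torch.bfloat16)        # learned per-channel scale
# ---------------------------------------------------------------------------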
hidden_states_17 = None + hidden_states_18 = ( + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + * to_13 + ) + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = ( + to_13 + ) = None + linear_11 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_1 = torch.nn.functional.silu(linear_11, inplace=False) + linear_11 = None + linear_12 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_18 = l_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_20 = silu_1 * linear_12 + silu_1 = linear_12 = None + down_proj_1 = torch._C._nn.linear( + mul_20, + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_20 = l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_19 = hidden_states_15 + down_proj_1 + hidden_states_15 = down_proj_1 = None + hidden_states_20 = hidden_states_19.to(torch.float32) + pow_5 = hidden_states_20.pow(2) + variance_4 = pow_5.mean(-1, keepdim=True) + pow_5 = None + add_12 = variance_4 + 1e-05 + variance_4 = None + rsqrt_4 = torch.rsqrt(add_12) + add_12 = None + hidden_states_21 = hidden_states_20 * rsqrt_4 + hidden_states_20 = rsqrt_4 = None + to_15 = hidden_states_21.to(torch.bfloat16) + hidden_states_21 = None + hidden_states_22 = ( + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + * to_15 + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + to_15 + ) = None + linear_14 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_7 = linear_14.view((1, 2, -1, 128)) + linear_14 = None + query_states_4 = view_7.transpose(1, 2) + view_7 = None + linear_15 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_8 = linear_15.view((1, 2, -1, 128)) + linear_15 = None + key_states_4 = view_8.transpose(1, 2) + view_8 = None + linear_16 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_22 = l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_9 = linear_16.view((1, 2, -1, 128)) + linear_16 = None + value_states_2 = view_9.transpose(1, 2) + view_9 = None + cos_6 = cos_1.unsqueeze(1) + sin_6 = sin_1.unsqueeze(1) + getitem_23 = cos_6[(Ellipsis, slice(None, 64, None))] + cos_6 = None + cos_7 = getitem_23.repeat_interleave(2, dim=-1) + getitem_23 = None + getitem_24 = sin_6[(Ellipsis, slice(None, 64, None))] + sin_6 = None + sin_7 = getitem_24.repeat_interleave(2, dim=-1) + getitem_24 = None + float_13 = query_states_4.float() + mul_23 = float_13 * cos_7 + float_13 = None + x1_4 = query_states_4[(Ellipsis, slice(0, None, 2))] + x2_4 = query_states_4[(Ellipsis, slice(1, None, 2))] + query_states_4 = None + neg_4 
= -x2_4 + x2_4 = None + stack_4 = torch.stack((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + flatten_4 = stack_4.flatten(-2) + stack_4 = None + float_14 = flatten_4.float() + flatten_4 = None + mul_24 = float_14 * sin_7 + float_14 = None + q_embed_2 = mul_23 + mul_24 + mul_23 = mul_24 = None + float_15 = key_states_4.float() + mul_25 = float_15 * cos_7 + float_15 = cos_7 = None + x1_5 = key_states_4[(Ellipsis, slice(0, None, 2))] + x2_5 = key_states_4[(Ellipsis, slice(1, None, 2))] + key_states_4 = None + neg_5 = -x2_5 + x2_5 = None + stack_5 = torch.stack((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + flatten_5 = stack_5.flatten(-2) + stack_5 = None + float_16 = flatten_5.float() + flatten_5 = None + mul_26 = float_16 * sin_7 + float_16 = sin_7 = None + k_embed_2 = mul_25 + mul_26 + mul_25 = mul_26 = None + query_states_5 = q_embed_2.to(torch.bfloat16) + q_embed_2 = None + key_states_5 = k_embed_2.to(torch.bfloat16) + k_embed_2 = None + getitem_29 = key_states_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_23 = getitem_29.expand(1, 2, 8, 2, 128) + getitem_29 = None + key_4 = hidden_states_23.reshape(1, 16, 2, 128) + hidden_states_23 = None + getitem_30 = value_states_2[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_24 = getitem_30.expand(1, 2, 8, 2, 128) + getitem_30 = None + value_4 = hidden_states_24.reshape(1, 16, 2, 128) + hidden_states_24 = None + attention_mask_3 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_2 = query_states_5.contiguous() + query_states_5 = None + key_5 = key_4.contiguous() + key_4 = None + value_5 = value_4.contiguous() + value_4 = None + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_5, + value_5, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_2 = key_5 = value_5 = attention_mask_3 = None + transpose_12 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_12.contiguous() + transpose_12 = None + reshape_8 = attn_output_9.reshape(1, 2, -1) + attn_output_9 = None + attn_output_10 = reshape_8.contiguous() + reshape_8 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_10 = l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_25 = hidden_states_19 + attn_output_11 + hidden_states_19 = attn_output_11 = None + hidden_states_26 = hidden_states_25.to(torch.float32) + pow_6 = hidden_states_26.pow(2) + variance_5 = pow_6.mean(-1, keepdim=True) + pow_6 = None + add_16 = variance_5 + 1e-05 + variance_5 = None + rsqrt_5 = torch.rsqrt(add_16) + add_16 = None + hidden_states_27 = hidden_states_26 * rsqrt_5 + hidden_states_26 = rsqrt_5 = None + to_19 = hidden_states_27.to(torch.bfloat16) + hidden_states_27 = None + hidden_states_28 = ( + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + * to_19 + ) + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = ( + to_19 + ) = None + linear_18 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + 
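# --- Editor's annotation: a hedged sketch, not part of the captured graph ---
# The x1 = q[..., 0::2] / x2 = q[..., 1::2] / stack((-x2, x1)) / flatten(-2)
# sequence repeated above applies rotary position embedding (RoPE) in its
# interleaved form: even/odd channel pairs are rotated by per-position angles.
# In this trace cos/sin arrive with each of the 64 base frequencies duplicated
# pairwise via cos[..., :64].repeat_interleave(2, dim=-1) for head_dim = 128.
# Function name is ours, for illustration only.
import torch

def apply_interleaved_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    x1 = x[..., 0::2]  # even channels
    x2 = x[..., 1::2]  # odd channels
    rotated = torch.stack((-x2, x1), dim=-1).flatten(-2)  # pairwise rotation by 90 degrees
    return (x.float() * cos + rotated.float() * sin).to(x.dtype)
# ---------------------------------------------------------------------------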
l_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_2 = torch.nn.functional.silu(linear_18, inplace=False) + linear_18 = None + linear_19 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_28 = l_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_29 = silu_2 * linear_19 + silu_2 = linear_19 = None + down_proj_2 = torch._C._nn.linear( + mul_29, + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_29 = l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_29 = hidden_states_25 + down_proj_2 + hidden_states_25 = down_proj_2 = None + hidden_states_30 = hidden_states_29.to(torch.float32) + pow_7 = hidden_states_30.pow(2) + variance_6 = pow_7.mean(-1, keepdim=True) + pow_7 = None + add_18 = variance_6 + 1e-05 + variance_6 = None + rsqrt_6 = torch.rsqrt(add_18) + add_18 = None + hidden_states_31 = hidden_states_30 * rsqrt_6 + hidden_states_30 = rsqrt_6 = None + to_21 = hidden_states_31.to(torch.bfloat16) + hidden_states_31 = None + hidden_states_32 = ( + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + * to_21 + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + to_21 + ) = None + linear_21 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_10 = linear_21.view((1, 2, -1, 128)) + linear_21 = None + query_states_6 = view_10.transpose(1, 2) + view_10 = None + linear_22 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_11 = linear_22.view((1, 2, -1, 128)) + linear_22 = None + key_states_6 = view_11.transpose(1, 2) + view_11 = None + linear_23 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_32 = l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_12 = linear_23.view((1, 2, -1, 128)) + linear_23 = None + value_states_3 = view_12.transpose(1, 2) + view_12 = None + cos_8 = cos_1.unsqueeze(1) + sin_8 = sin_1.unsqueeze(1) + getitem_32 = cos_8[(Ellipsis, slice(None, 64, None))] + cos_8 = None + cos_9 = getitem_32.repeat_interleave(2, dim=-1) + getitem_32 = None + getitem_33 = sin_8[(Ellipsis, slice(None, 64, None))] + sin_8 = None + sin_9 = getitem_33.repeat_interleave(2, dim=-1) + getitem_33 = None + float_17 = query_states_6.float() + mul_32 = float_17 * cos_9 + float_17 = None + x1_6 = query_states_6[(Ellipsis, slice(0, None, 2))] + x2_6 = query_states_6[(Ellipsis, slice(1, None, 2))] + query_states_6 = None + neg_6 = -x2_6 + x2_6 = None + stack_6 = torch.stack((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + flatten_6 = stack_6.flatten(-2) + stack_6 = None + float_18 = flatten_6.float() + flatten_6 = None + mul_33 = float_18 * sin_9 + float_18 = None + q_embed_3 = mul_32 + mul_33 + mul_32 = mul_33 = None + float_19 = key_states_6.float() + mul_34 = float_19 * cos_9 + float_19 = cos_9 = None + x1_7 = 
key_states_6[(Ellipsis, slice(0, None, 2))] + x2_7 = key_states_6[(Ellipsis, slice(1, None, 2))] + key_states_6 = None + neg_7 = -x2_7 + x2_7 = None + stack_7 = torch.stack((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + flatten_7 = stack_7.flatten(-2) + stack_7 = None + float_20 = flatten_7.float() + flatten_7 = None + mul_35 = float_20 * sin_9 + float_20 = sin_9 = None + k_embed_3 = mul_34 + mul_35 + mul_34 = mul_35 = None + query_states_7 = q_embed_3.to(torch.bfloat16) + q_embed_3 = None + key_states_7 = k_embed_3.to(torch.bfloat16) + k_embed_3 = None + getitem_38 = key_states_7[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_33 = getitem_38.expand(1, 2, 8, 2, 128) + getitem_38 = None + key_6 = hidden_states_33.reshape(1, 16, 2, 128) + hidden_states_33 = None + getitem_39 = value_states_3[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_34 = getitem_39.expand(1, 2, 8, 2, 128) + getitem_39 = None + value_6 = hidden_states_34.reshape(1, 16, 2, 128) + hidden_states_34 = None + attention_mask_4 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_3 = query_states_7.contiguous() + query_states_7 = None + key_7 = key_6.contiguous() + key_6 = None + value_7 = value_6.contiguous() + value_6 = None + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_3, + key_7, + value_7, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_3 = key_7 = value_7 = attention_mask_4 = None + transpose_16 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_16.contiguous() + transpose_16 = None + reshape_11 = attn_output_13.reshape(1, 2, -1) + attn_output_13 = None + attn_output_14 = reshape_11.contiguous() + reshape_11 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_14 = l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_35 = hidden_states_29 + attn_output_15 + hidden_states_29 = attn_output_15 = None + hidden_states_36 = hidden_states_35.to(torch.float32) + pow_8 = hidden_states_36.pow(2) + variance_7 = pow_8.mean(-1, keepdim=True) + pow_8 = None + add_22 = variance_7 + 1e-05 + variance_7 = None + rsqrt_7 = torch.rsqrt(add_22) + add_22 = None + hidden_states_37 = hidden_states_36 * rsqrt_7 + hidden_states_36 = rsqrt_7 = None + to_25 = hidden_states_37.to(torch.bfloat16) + hidden_states_37 = None + hidden_states_38 = ( + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + * to_25 + ) + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = ( + to_25 + ) = None + linear_25 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_3 = torch.nn.functional.silu(linear_25, inplace=False) + linear_25 = None + linear_26 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_38 = 
l_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_38 = silu_3 * linear_26 + silu_3 = linear_26 = None + down_proj_3 = torch._C._nn.linear( + mul_38, + l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_38 = l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_39 = hidden_states_35 + down_proj_3 + hidden_states_35 = down_proj_3 = None + hidden_states_40 = hidden_states_39.to(torch.float32) + pow_9 = hidden_states_40.pow(2) + variance_8 = pow_9.mean(-1, keepdim=True) + pow_9 = None + add_24 = variance_8 + 1e-05 + variance_8 = None + rsqrt_8 = torch.rsqrt(add_24) + add_24 = None + hidden_states_41 = hidden_states_40 * rsqrt_8 + hidden_states_40 = rsqrt_8 = None + to_27 = hidden_states_41.to(torch.bfloat16) + hidden_states_41 = None + hidden_states_42 = ( + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + * to_27 + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + to_27 + ) = None + linear_28 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_13 = linear_28.view((1, 2, -1, 128)) + linear_28 = None + query_states_8 = view_13.transpose(1, 2) + view_13 = None + linear_29 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_14 = linear_29.view((1, 2, -1, 128)) + linear_29 = None + key_states_8 = view_14.transpose(1, 2) + view_14 = None + linear_30 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_42 = l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_15 = linear_30.view((1, 2, -1, 128)) + linear_30 = None + value_states_4 = view_15.transpose(1, 2) + view_15 = None + cos_10 = cos_1.unsqueeze(1) + sin_10 = sin_1.unsqueeze(1) + getitem_41 = cos_10[(Ellipsis, slice(None, 64, None))] + cos_10 = None + cos_11 = getitem_41.repeat_interleave(2, dim=-1) + getitem_41 = None + getitem_42 = sin_10[(Ellipsis, slice(None, 64, None))] + sin_10 = None + sin_11 = getitem_42.repeat_interleave(2, dim=-1) + getitem_42 = None + float_21 = query_states_8.float() + mul_41 = float_21 * cos_11 + float_21 = None + x1_8 = query_states_8[(Ellipsis, slice(0, None, 2))] + x2_8 = query_states_8[(Ellipsis, slice(1, None, 2))] + query_states_8 = None + neg_8 = -x2_8 + x2_8 = None + stack_8 = torch.stack((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + flatten_8 = stack_8.flatten(-2) + stack_8 = None + float_22 = flatten_8.float() + flatten_8 = None + mul_42 = float_22 * sin_11 + float_22 = None + q_embed_4 = mul_41 + mul_42 + mul_41 = mul_42 = None + float_23 = key_states_8.float() + mul_43 = float_23 * cos_11 + float_23 = cos_11 = None + x1_9 = key_states_8[(Ellipsis, slice(0, None, 2))] + x2_9 = key_states_8[(Ellipsis, slice(1, None, 2))] + key_states_8 = None + neg_9 = -x2_9 + x2_9 = None + stack_9 = torch.stack((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + flatten_9 = stack_9.flatten(-2) + stack_9 = None + float_24 = flatten_9.float() + flatten_9 = None + mul_44 = float_24 * 
sin_11 + float_24 = sin_11 = None + k_embed_4 = mul_43 + mul_44 + mul_43 = mul_44 = None + query_states_9 = q_embed_4.to(torch.bfloat16) + q_embed_4 = None + key_states_9 = k_embed_4.to(torch.bfloat16) + k_embed_4 = None + getitem_47 = key_states_9[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_43 = getitem_47.expand(1, 2, 8, 2, 128) + getitem_47 = None + key_8 = hidden_states_43.reshape(1, 16, 2, 128) + hidden_states_43 = None + getitem_48 = value_states_4[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_44 = getitem_48.expand(1, 2, 8, 2, 128) + getitem_48 = None + value_8 = hidden_states_44.reshape(1, 16, 2, 128) + hidden_states_44 = None + attention_mask_5 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_4 = query_states_9.contiguous() + query_states_9 = None + key_9 = key_8.contiguous() + key_8 = None + value_9 = value_8.contiguous() + value_8 = None + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_9, + value_9, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_4 = key_9 = value_9 = attention_mask_5 = None + transpose_20 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_20.contiguous() + transpose_20 = None + reshape_14 = attn_output_17.reshape(1, 2, -1) + attn_output_17 = None + attn_output_18 = reshape_14.contiguous() + reshape_14 = None + attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_18 = l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_45 = hidden_states_39 + attn_output_19 + hidden_states_39 = attn_output_19 = None + hidden_states_46 = hidden_states_45.to(torch.float32) + pow_10 = hidden_states_46.pow(2) + variance_9 = pow_10.mean(-1, keepdim=True) + pow_10 = None + add_28 = variance_9 + 1e-05 + variance_9 = None + rsqrt_9 = torch.rsqrt(add_28) + add_28 = None + hidden_states_47 = hidden_states_46 * rsqrt_9 + hidden_states_46 = rsqrt_9 = None + to_31 = hidden_states_47.to(torch.bfloat16) + hidden_states_47 = None + hidden_states_48 = ( + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + * to_31 + ) + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = ( + to_31 + ) = None + linear_32 = torch._C._nn.linear( + hidden_states_48, + l_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_4 = torch.nn.functional.silu(linear_32, inplace=False) + linear_32 = None + linear_33 = torch._C._nn.linear( + hidden_states_48, + l_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_48 = l_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_47 = silu_4 * linear_33 + silu_4 = linear_33 = None + down_proj_4 = torch._C._nn.linear( + mul_47, + l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_47 = 
l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_49 = hidden_states_45 + down_proj_4 + hidden_states_45 = down_proj_4 = None + hidden_states_50 = hidden_states_49.to(torch.float32) + pow_11 = hidden_states_50.pow(2) + variance_10 = pow_11.mean(-1, keepdim=True) + pow_11 = None + add_30 = variance_10 + 1e-05 + variance_10 = None + rsqrt_10 = torch.rsqrt(add_30) + add_30 = None + hidden_states_51 = hidden_states_50 * rsqrt_10 + hidden_states_50 = rsqrt_10 = None + to_33 = hidden_states_51.to(torch.bfloat16) + hidden_states_51 = None + hidden_states_52 = ( + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + * to_33 + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + to_33 + ) = None + linear_35 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_16 = linear_35.view((1, 2, -1, 128)) + linear_35 = None + query_states_10 = view_16.transpose(1, 2) + view_16 = None + linear_36 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_17 = linear_36.view((1, 2, -1, 128)) + linear_36 = None + key_states_10 = view_17.transpose(1, 2) + view_17 = None + linear_37 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_52 = l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_18 = linear_37.view((1, 2, -1, 128)) + linear_37 = None + value_states_5 = view_18.transpose(1, 2) + view_18 = None + cos_12 = cos_1.unsqueeze(1) + sin_12 = sin_1.unsqueeze(1) + getitem_50 = cos_12[(Ellipsis, slice(None, 64, None))] + cos_12 = None + cos_13 = getitem_50.repeat_interleave(2, dim=-1) + getitem_50 = None + getitem_51 = sin_12[(Ellipsis, slice(None, 64, None))] + sin_12 = None + sin_13 = getitem_51.repeat_interleave(2, dim=-1) + getitem_51 = None + float_25 = query_states_10.float() + mul_50 = float_25 * cos_13 + float_25 = None + x1_10 = query_states_10[(Ellipsis, slice(0, None, 2))] + x2_10 = query_states_10[(Ellipsis, slice(1, None, 2))] + query_states_10 = None + neg_10 = -x2_10 + x2_10 = None + stack_10 = torch.stack((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + flatten_10 = stack_10.flatten(-2) + stack_10 = None + float_26 = flatten_10.float() + flatten_10 = None + mul_51 = float_26 * sin_13 + float_26 = None + q_embed_5 = mul_50 + mul_51 + mul_50 = mul_51 = None + float_27 = key_states_10.float() + mul_52 = float_27 * cos_13 + float_27 = cos_13 = None + x1_11 = key_states_10[(Ellipsis, slice(0, None, 2))] + x2_11 = key_states_10[(Ellipsis, slice(1, None, 2))] + key_states_10 = None + neg_11 = -x2_11 + x2_11 = None + stack_11 = torch.stack((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + flatten_11 = stack_11.flatten(-2) + stack_11 = None + float_28 = flatten_11.float() + flatten_11 = None + mul_53 = float_28 * sin_13 + float_28 = sin_13 = None + k_embed_5 = mul_52 + mul_53 + mul_52 = mul_53 = None + query_states_11 = q_embed_5.to(torch.bfloat16) + q_embed_5 = None + key_states_11 = k_embed_5.to(torch.bfloat16) + k_embed_5 = None + getitem_56 = key_states_11[ + 
( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_53 = getitem_56.expand(1, 2, 8, 2, 128) + getitem_56 = None + key_10 = hidden_states_53.reshape(1, 16, 2, 128) + hidden_states_53 = None + getitem_57 = value_states_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_54 = getitem_57.expand(1, 2, 8, 2, 128) + getitem_57 = None + value_10 = hidden_states_54.reshape(1, 16, 2, 128) + hidden_states_54 = None + attention_mask_6 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_5 = query_states_11.contiguous() + query_states_11 = None + key_11 = key_10.contiguous() + key_10 = None + value_11 = value_10.contiguous() + value_10 = None + attn_output_20 = torch._C._nn.scaled_dot_product_attention( + query_5, + key_11, + value_11, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_5 = key_11 = value_11 = attention_mask_6 = None + transpose_24 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_24.contiguous() + transpose_24 = None + reshape_17 = attn_output_21.reshape(1, 2, -1) + attn_output_21 = None + attn_output_22 = reshape_17.contiguous() + reshape_17 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_22 = l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_55 = hidden_states_49 + attn_output_23 + hidden_states_49 = attn_output_23 = None + hidden_states_56 = hidden_states_55.to(torch.float32) + pow_12 = hidden_states_56.pow(2) + variance_11 = pow_12.mean(-1, keepdim=True) + pow_12 = None + add_34 = variance_11 + 1e-05 + variance_11 = None + rsqrt_11 = torch.rsqrt(add_34) + add_34 = None + hidden_states_57 = hidden_states_56 * rsqrt_11 + hidden_states_56 = rsqrt_11 = None + to_37 = hidden_states_57.to(torch.bfloat16) + hidden_states_57 = None + hidden_states_58 = ( + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + * to_37 + ) + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = ( + to_37 + ) = None + linear_39 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_5 = torch.nn.functional.silu(linear_39, inplace=False) + linear_39 = None + linear_40 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_58 = l_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_56 = silu_5 * linear_40 + silu_5 = linear_40 = None + down_proj_5 = torch._C._nn.linear( + mul_56, + l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_56 = l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_59 = hidden_states_55 + down_proj_5 + hidden_states_55 = down_proj_5 = None + hidden_states_60 = hidden_states_59.to(torch.float32) + pow_13 = hidden_states_60.pow(2) + variance_12 = pow_13.mean(-1, 
keepdim=True) + pow_13 = None + add_36 = variance_12 + 1e-05 + variance_12 = None + rsqrt_12 = torch.rsqrt(add_36) + add_36 = None + hidden_states_61 = hidden_states_60 * rsqrt_12 + hidden_states_60 = rsqrt_12 = None + to_39 = hidden_states_61.to(torch.bfloat16) + hidden_states_61 = None + hidden_states_62 = ( + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + * to_39 + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + to_39 + ) = None + linear_42 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_19 = linear_42.view((1, 2, -1, 128)) + linear_42 = None + query_states_12 = view_19.transpose(1, 2) + view_19 = None + linear_43 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_20 = linear_43.view((1, 2, -1, 128)) + linear_43 = None + key_states_12 = view_20.transpose(1, 2) + view_20 = None + linear_44 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_62 = l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_21 = linear_44.view((1, 2, -1, 128)) + linear_44 = None + value_states_6 = view_21.transpose(1, 2) + view_21 = None + cos_14 = cos_1.unsqueeze(1) + sin_14 = sin_1.unsqueeze(1) + getitem_59 = cos_14[(Ellipsis, slice(None, 64, None))] + cos_14 = None + cos_15 = getitem_59.repeat_interleave(2, dim=-1) + getitem_59 = None + getitem_60 = sin_14[(Ellipsis, slice(None, 64, None))] + sin_14 = None + sin_15 = getitem_60.repeat_interleave(2, dim=-1) + getitem_60 = None + float_29 = query_states_12.float() + mul_59 = float_29 * cos_15 + float_29 = None + x1_12 = query_states_12[(Ellipsis, slice(0, None, 2))] + x2_12 = query_states_12[(Ellipsis, slice(1, None, 2))] + query_states_12 = None + neg_12 = -x2_12 + x2_12 = None + stack_12 = torch.stack((neg_12, x1_12), dim=-1) + neg_12 = x1_12 = None + flatten_12 = stack_12.flatten(-2) + stack_12 = None + float_30 = flatten_12.float() + flatten_12 = None + mul_60 = float_30 * sin_15 + float_30 = None + q_embed_6 = mul_59 + mul_60 + mul_59 = mul_60 = None + float_31 = key_states_12.float() + mul_61 = float_31 * cos_15 + float_31 = cos_15 = None + x1_13 = key_states_12[(Ellipsis, slice(0, None, 2))] + x2_13 = key_states_12[(Ellipsis, slice(1, None, 2))] + key_states_12 = None + neg_13 = -x2_13 + x2_13 = None + stack_13 = torch.stack((neg_13, x1_13), dim=-1) + neg_13 = x1_13 = None + flatten_13 = stack_13.flatten(-2) + stack_13 = None + float_32 = flatten_13.float() + flatten_13 = None + mul_62 = float_32 * sin_15 + float_32 = sin_15 = None + k_embed_6 = mul_61 + mul_62 + mul_61 = mul_62 = None + query_states_13 = q_embed_6.to(torch.bfloat16) + q_embed_6 = None + key_states_13 = k_embed_6.to(torch.bfloat16) + k_embed_6 = None + getitem_65 = key_states_13[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_63 = getitem_65.expand(1, 2, 8, 2, 128) + getitem_65 = None + key_12 = hidden_states_63.reshape(1, 16, 2, 128) + hidden_states_63 = None + getitem_66 = 
value_states_6[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_64 = getitem_66.expand(1, 2, 8, 2, 128) + getitem_66 = None + value_12 = hidden_states_64.reshape(1, 16, 2, 128) + hidden_states_64 = None + attention_mask_7 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_6 = query_states_13.contiguous() + query_states_13 = None + key_13 = key_12.contiguous() + key_12 = None + value_13 = value_12.contiguous() + value_12 = None + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_13, + value_13, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_6 = key_13 = value_13 = attention_mask_7 = None + transpose_28 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_28.contiguous() + transpose_28 = None + reshape_20 = attn_output_25.reshape(1, 2, -1) + attn_output_25 = None + attn_output_26 = reshape_20.contiguous() + reshape_20 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_26 = l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_65 = hidden_states_59 + attn_output_27 + hidden_states_59 = attn_output_27 = None + hidden_states_66 = hidden_states_65.to(torch.float32) + pow_14 = hidden_states_66.pow(2) + variance_13 = pow_14.mean(-1, keepdim=True) + pow_14 = None + add_40 = variance_13 + 1e-05 + variance_13 = None + rsqrt_13 = torch.rsqrt(add_40) + add_40 = None + hidden_states_67 = hidden_states_66 * rsqrt_13 + hidden_states_66 = rsqrt_13 = None + to_43 = hidden_states_67.to(torch.bfloat16) + hidden_states_67 = None + hidden_states_68 = ( + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + * to_43 + ) + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = ( + to_43 + ) = None + linear_46 = torch._C._nn.linear( + hidden_states_68, + l_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_6 = torch.nn.functional.silu(linear_46, inplace=False) + linear_46 = None + linear_47 = torch._C._nn.linear( + hidden_states_68, + l_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_68 = l_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_65 = silu_6 * linear_47 + silu_6 = linear_47 = None + down_proj_6 = torch._C._nn.linear( + mul_65, + l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_65 = l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_69 = hidden_states_65 + down_proj_6 + hidden_states_65 = down_proj_6 = None + hidden_states_70 = hidden_states_69.to(torch.float32) + pow_15 = hidden_states_70.pow(2) + variance_14 = pow_15.mean(-1, keepdim=True) + pow_15 = None + add_42 = variance_14 + 1e-05 + variance_14 = None + rsqrt_14 = torch.rsqrt(add_42) + add_42 = None + hidden_states_71 = hidden_states_70 * rsqrt_14 + hidden_states_70 = rsqrt_14 = None + to_45 = hidden_states_71.to(torch.bfloat16) + hidden_states_71 = None + 
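# --- Editor's annotation: a hedged sketch, not part of the captured graph ---
# Each MLP block above is a SwiGLU feed-forward: silu(gate_proj(x)) * up_proj(x)
# fed through down_proj, with all three projections bias-free
# (torch._C._nn.linear(..., None) in the trace). Helper name is illustrative.
import torch
import torch.nn.functional as F

def swiglu_mlp(x: torch.Tensor, w_gate: torch.Tensor, w_up: torch.Tensor,
               w_down: torch.Tensor) -> torch.Tensor:
    gate = F.silu(F.linear(x, w_gate))  # gating branch
    up = F.linear(x, w_up)              # linear branch
    return F.linear(gate * up, w_down)  # project back to the hidden size
# ---------------------------------------------------------------------------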
hidden_states_72 = ( + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + * to_45 + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + to_45 + ) = None + linear_49 = torch._C._nn.linear( + hidden_states_72, + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_22 = linear_49.view((1, 2, -1, 128)) + linear_49 = None + query_states_14 = view_22.transpose(1, 2) + view_22 = None + linear_50 = torch._C._nn.linear( + hidden_states_72, + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_23 = linear_50.view((1, 2, -1, 128)) + linear_50 = None + key_states_14 = view_23.transpose(1, 2) + view_23 = None + linear_51 = torch._C._nn.linear( + hidden_states_72, + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_72 = l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_24 = linear_51.view((1, 2, -1, 128)) + linear_51 = None + value_states_7 = view_24.transpose(1, 2) + view_24 = None + cos_16 = cos_1.unsqueeze(1) + sin_16 = sin_1.unsqueeze(1) + getitem_68 = cos_16[(Ellipsis, slice(None, 64, None))] + cos_16 = None + cos_17 = getitem_68.repeat_interleave(2, dim=-1) + getitem_68 = None + getitem_69 = sin_16[(Ellipsis, slice(None, 64, None))] + sin_16 = None + sin_17 = getitem_69.repeat_interleave(2, dim=-1) + getitem_69 = None + float_33 = query_states_14.float() + mul_68 = float_33 * cos_17 + float_33 = None + x1_14 = query_states_14[(Ellipsis, slice(0, None, 2))] + x2_14 = query_states_14[(Ellipsis, slice(1, None, 2))] + query_states_14 = None + neg_14 = -x2_14 + x2_14 = None + stack_14 = torch.stack((neg_14, x1_14), dim=-1) + neg_14 = x1_14 = None + flatten_14 = stack_14.flatten(-2) + stack_14 = None + float_34 = flatten_14.float() + flatten_14 = None + mul_69 = float_34 * sin_17 + float_34 = None + q_embed_7 = mul_68 + mul_69 + mul_68 = mul_69 = None + float_35 = key_states_14.float() + mul_70 = float_35 * cos_17 + float_35 = cos_17 = None + x1_15 = key_states_14[(Ellipsis, slice(0, None, 2))] + x2_15 = key_states_14[(Ellipsis, slice(1, None, 2))] + key_states_14 = None + neg_15 = -x2_15 + x2_15 = None + stack_15 = torch.stack((neg_15, x1_15), dim=-1) + neg_15 = x1_15 = None + flatten_15 = stack_15.flatten(-2) + stack_15 = None + float_36 = flatten_15.float() + flatten_15 = None + mul_71 = float_36 * sin_17 + float_36 = sin_17 = None + k_embed_7 = mul_70 + mul_71 + mul_70 = mul_71 = None + query_states_15 = q_embed_7.to(torch.bfloat16) + q_embed_7 = None + key_states_15 = k_embed_7.to(torch.bfloat16) + k_embed_7 = None + getitem_74 = key_states_15[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_73 = getitem_74.expand(1, 2, 8, 2, 128) + getitem_74 = None + key_14 = hidden_states_73.reshape(1, 16, 2, 128) + hidden_states_73 = None + getitem_75 = value_states_7[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_74 = getitem_75.expand(1, 2, 8, 2, 128) + getitem_75 = None + value_14 = hidden_states_74.reshape(1, 16, 2, 128) + hidden_states_74 = None + 
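# --- Editor's annotation: a hedged sketch, not part of the captured graph ---
# The expand(1, 2, 8, 2, 128) -> reshape(1, 16, 2, 128) pairs above implement
# grouped-query attention: 2 KV heads are broadcast to 16 query heads (group
# size 8) without copying until the reshape. The literal shapes are
# specialized to the captured sequence length of 2; a shape-generic sketch
# (function name ours):
import torch

def repeat_kv(x: torch.Tensor, n_rep: int = 8) -> torch.Tensor:
    # (batch, num_kv_heads, seq, head_dim) -> (batch, num_kv_heads * n_rep, seq, head_dim)
    b, kv, s, d = x.shape
    return x[:, :, None, :, :].expand(b, kv, n_rep, s, d).reshape(b, kv * n_rep, s, d)
# ---------------------------------------------------------------------------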
attention_mask_8 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_7 = query_states_15.contiguous() + query_states_15 = None + key_15 = key_14.contiguous() + key_14 = None + value_15 = value_14.contiguous() + value_14 = None + attn_output_28 = torch._C._nn.scaled_dot_product_attention( + query_7, + key_15, + value_15, + attn_mask=attention_mask_8, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_7 = key_15 = value_15 = attention_mask_8 = None + transpose_32 = attn_output_28.transpose(1, 2) + attn_output_28 = None + attn_output_29 = transpose_32.contiguous() + transpose_32 = None + reshape_23 = attn_output_29.reshape(1, 2, -1) + attn_output_29 = None + attn_output_30 = reshape_23.contiguous() + reshape_23 = None + attn_output_31 = torch._C._nn.linear( + attn_output_30, + l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_30 = l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_75 = hidden_states_69 + attn_output_31 + hidden_states_69 = attn_output_31 = None + hidden_states_76 = hidden_states_75.to(torch.float32) + pow_16 = hidden_states_76.pow(2) + variance_15 = pow_16.mean(-1, keepdim=True) + pow_16 = None + add_46 = variance_15 + 1e-05 + variance_15 = None + rsqrt_15 = torch.rsqrt(add_46) + add_46 = None + hidden_states_77 = hidden_states_76 * rsqrt_15 + hidden_states_76 = rsqrt_15 = None + to_49 = hidden_states_77.to(torch.bfloat16) + hidden_states_77 = None + hidden_states_78 = ( + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + * to_49 + ) + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = ( + to_49 + ) = None + linear_53 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_7 = torch.nn.functional.silu(linear_53, inplace=False) + linear_53 = None + linear_54 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_78 = l_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_74 = silu_7 * linear_54 + silu_7 = linear_54 = None + down_proj_7 = torch._C._nn.linear( + mul_74, + l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_74 = l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_79 = hidden_states_75 + down_proj_7 + hidden_states_75 = down_proj_7 = None + hidden_states_80 = hidden_states_79.to(torch.float32) + pow_17 = hidden_states_80.pow(2) + variance_16 = pow_17.mean(-1, keepdim=True) + pow_17 = None + add_48 = variance_16 + 1e-05 + variance_16 = None + rsqrt_16 = torch.rsqrt(add_48) + add_48 = None + hidden_states_81 = hidden_states_80 * rsqrt_16 + hidden_states_80 = rsqrt_16 = None + to_51 = hidden_states_81.to(torch.bfloat16) + hidden_states_81 = None + hidden_states_82 = ( + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + * to_51 + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + to_51 + ) = None + linear_56 = torch._C._nn.linear( + hidden_states_82, + 
l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_25 = linear_56.view((1, 2, -1, 128)) + linear_56 = None + query_states_16 = view_25.transpose(1, 2) + view_25 = None + linear_57 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_26 = linear_57.view((1, 2, -1, 128)) + linear_57 = None + key_states_16 = view_26.transpose(1, 2) + view_26 = None + linear_58 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_82 = l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_27 = linear_58.view((1, 2, -1, 128)) + linear_58 = None + value_states_8 = view_27.transpose(1, 2) + view_27 = None + cos_18 = cos_1.unsqueeze(1) + sin_18 = sin_1.unsqueeze(1) + getitem_77 = cos_18[(Ellipsis, slice(None, 64, None))] + cos_18 = None + cos_19 = getitem_77.repeat_interleave(2, dim=-1) + getitem_77 = None + getitem_78 = sin_18[(Ellipsis, slice(None, 64, None))] + sin_18 = None + sin_19 = getitem_78.repeat_interleave(2, dim=-1) + getitem_78 = None + float_37 = query_states_16.float() + mul_77 = float_37 * cos_19 + float_37 = None + x1_16 = query_states_16[(Ellipsis, slice(0, None, 2))] + x2_16 = query_states_16[(Ellipsis, slice(1, None, 2))] + query_states_16 = None + neg_16 = -x2_16 + x2_16 = None + stack_16 = torch.stack((neg_16, x1_16), dim=-1) + neg_16 = x1_16 = None + flatten_16 = stack_16.flatten(-2) + stack_16 = None + float_38 = flatten_16.float() + flatten_16 = None + mul_78 = float_38 * sin_19 + float_38 = None + q_embed_8 = mul_77 + mul_78 + mul_77 = mul_78 = None + float_39 = key_states_16.float() + mul_79 = float_39 * cos_19 + float_39 = cos_19 = None + x1_17 = key_states_16[(Ellipsis, slice(0, None, 2))] + x2_17 = key_states_16[(Ellipsis, slice(1, None, 2))] + key_states_16 = None + neg_17 = -x2_17 + x2_17 = None + stack_17 = torch.stack((neg_17, x1_17), dim=-1) + neg_17 = x1_17 = None + flatten_17 = stack_17.flatten(-2) + stack_17 = None + float_40 = flatten_17.float() + flatten_17 = None + mul_80 = float_40 * sin_19 + float_40 = sin_19 = None + k_embed_8 = mul_79 + mul_80 + mul_79 = mul_80 = None + query_states_17 = q_embed_8.to(torch.bfloat16) + q_embed_8 = None + key_states_17 = k_embed_8.to(torch.bfloat16) + k_embed_8 = None + getitem_83 = key_states_17[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_83 = getitem_83.expand(1, 2, 8, 2, 128) + getitem_83 = None + key_16 = hidden_states_83.reshape(1, 16, 2, 128) + hidden_states_83 = None + getitem_84 = value_states_8[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_84 = getitem_84.expand(1, 2, 8, 2, 128) + getitem_84 = None + value_16 = hidden_states_84.reshape(1, 16, 2, 128) + hidden_states_84 = None + attention_mask_9 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_8 = query_states_17.contiguous() + query_states_17 = None + key_17 = key_16.contiguous() + key_16 = None + 
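# --- Editor's annotation: a hedged sketch, not part of the captured graph ---
# Every attention block in this graph calls scaled_dot_product_attention with
# the mask passed explicitly (is_causal=False) and scale=0.08838834764831845,
# which is 1 / sqrt(head_dim) for head_dim = 128. Function name is ours.
import math
import torch
import torch.nn.functional as F

assert abs(0.08838834764831845 - 1 / math.sqrt(128)) < 1e-12

def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
              mask: torch.Tensor) -> torch.Tensor:
    return F.scaled_dot_product_attention(
        q, k, v, attn_mask=mask, dropout_p=0.0,
        scale=1 / math.sqrt(q.shape[-1]), is_causal=False,
    )
# ---------------------------------------------------------------------------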
value_17 = value_16.contiguous() + value_16 = None + attn_output_32 = torch._C._nn.scaled_dot_product_attention( + query_8, + key_17, + value_17, + attn_mask=attention_mask_9, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_8 = key_17 = value_17 = attention_mask_9 = None + transpose_36 = attn_output_32.transpose(1, 2) + attn_output_32 = None + attn_output_33 = transpose_36.contiguous() + transpose_36 = None + reshape_26 = attn_output_33.reshape(1, 2, -1) + attn_output_33 = None + attn_output_34 = reshape_26.contiguous() + reshape_26 = None + attn_output_35 = torch._C._nn.linear( + attn_output_34, + l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_34 = l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_85 = hidden_states_79 + attn_output_35 + hidden_states_79 = attn_output_35 = None + hidden_states_86 = hidden_states_85.to(torch.float32) + pow_18 = hidden_states_86.pow(2) + variance_17 = pow_18.mean(-1, keepdim=True) + pow_18 = None + add_52 = variance_17 + 1e-05 + variance_17 = None + rsqrt_17 = torch.rsqrt(add_52) + add_52 = None + hidden_states_87 = hidden_states_86 * rsqrt_17 + hidden_states_86 = rsqrt_17 = None + to_55 = hidden_states_87.to(torch.bfloat16) + hidden_states_87 = None + hidden_states_88 = ( + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + * to_55 + ) + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = ( + to_55 + ) = None + linear_60 = torch._C._nn.linear( + hidden_states_88, + l_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_8 = torch.nn.functional.silu(linear_60, inplace=False) + linear_60 = None + linear_61 = torch._C._nn.linear( + hidden_states_88, + l_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_88 = l_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_83 = silu_8 * linear_61 + silu_8 = linear_61 = None + down_proj_8 = torch._C._nn.linear( + mul_83, + l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_83 = l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_89 = hidden_states_85 + down_proj_8 + hidden_states_85 = down_proj_8 = None + hidden_states_90 = hidden_states_89.to(torch.float32) + pow_19 = hidden_states_90.pow(2) + variance_18 = pow_19.mean(-1, keepdim=True) + pow_19 = None + add_54 = variance_18 + 1e-05 + variance_18 = None + rsqrt_18 = torch.rsqrt(add_54) + add_54 = None + hidden_states_91 = hidden_states_90 * rsqrt_18 + hidden_states_90 = rsqrt_18 = None + to_57 = hidden_states_91.to(torch.bfloat16) + hidden_states_91 = None + hidden_states_92 = ( + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + * to_57 + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + to_57 + ) = None + linear_63 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_28 = linear_63.view((1, 2, -1, 128)) + linear_63 = None + query_states_18 = view_28.transpose(1, 
2) + view_28 = None + linear_64 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_29 = linear_64.view((1, 2, -1, 128)) + linear_64 = None + key_states_18 = view_29.transpose(1, 2) + view_29 = None + linear_65 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_92 = l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_30 = linear_65.view((1, 2, -1, 128)) + linear_65 = None + value_states_9 = view_30.transpose(1, 2) + view_30 = None + cos_20 = cos_1.unsqueeze(1) + sin_20 = sin_1.unsqueeze(1) + getitem_86 = cos_20[(Ellipsis, slice(None, 64, None))] + cos_20 = None + cos_21 = getitem_86.repeat_interleave(2, dim=-1) + getitem_86 = None + getitem_87 = sin_20[(Ellipsis, slice(None, 64, None))] + sin_20 = None + sin_21 = getitem_87.repeat_interleave(2, dim=-1) + getitem_87 = None + float_41 = query_states_18.float() + mul_86 = float_41 * cos_21 + float_41 = None + x1_18 = query_states_18[(Ellipsis, slice(0, None, 2))] + x2_18 = query_states_18[(Ellipsis, slice(1, None, 2))] + query_states_18 = None + neg_18 = -x2_18 + x2_18 = None + stack_18 = torch.stack((neg_18, x1_18), dim=-1) + neg_18 = x1_18 = None + flatten_18 = stack_18.flatten(-2) + stack_18 = None + float_42 = flatten_18.float() + flatten_18 = None + mul_87 = float_42 * sin_21 + float_42 = None + q_embed_9 = mul_86 + mul_87 + mul_86 = mul_87 = None + float_43 = key_states_18.float() + mul_88 = float_43 * cos_21 + float_43 = cos_21 = None + x1_19 = key_states_18[(Ellipsis, slice(0, None, 2))] + x2_19 = key_states_18[(Ellipsis, slice(1, None, 2))] + key_states_18 = None + neg_19 = -x2_19 + x2_19 = None + stack_19 = torch.stack((neg_19, x1_19), dim=-1) + neg_19 = x1_19 = None + flatten_19 = stack_19.flatten(-2) + stack_19 = None + float_44 = flatten_19.float() + flatten_19 = None + mul_89 = float_44 * sin_21 + float_44 = sin_21 = None + k_embed_9 = mul_88 + mul_89 + mul_88 = mul_89 = None + query_states_19 = q_embed_9.to(torch.bfloat16) + q_embed_9 = None + key_states_19 = k_embed_9.to(torch.bfloat16) + k_embed_9 = None + getitem_92 = key_states_19[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_93 = getitem_92.expand(1, 2, 8, 2, 128) + getitem_92 = None + key_18 = hidden_states_93.reshape(1, 16, 2, 128) + hidden_states_93 = None + getitem_93 = value_states_9[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_94 = getitem_93.expand(1, 2, 8, 2, 128) + getitem_93 = None + value_18 = hidden_states_94.reshape(1, 16, 2, 128) + hidden_states_94 = None + attention_mask_10 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_9 = query_states_19.contiguous() + query_states_19 = None + key_19 = key_18.contiguous() + key_18 = None + value_19 = value_18.contiguous() + value_18 = None + attn_output_36 = torch._C._nn.scaled_dot_product_attention( + query_9, + key_19, + value_19, + attn_mask=attention_mask_10, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_9 = key_19 = value_19 = attention_mask_10 = None + 
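The literal scale=0.08838834764831845 that every scaled_dot_product_attention call above receives is simply 1/sqrt(head_dim) for head_dim=128, the standard softmax temperature, baked into the trace as a constant:

import math

head_dim = 128
print(1.0 / math.sqrt(head_dim))  # ~0.0883883476483184..., the constant in the trace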
transpose_40 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_37 = transpose_40.contiguous() + transpose_40 = None + reshape_29 = attn_output_37.reshape(1, 2, -1) + attn_output_37 = None + attn_output_38 = reshape_29.contiguous() + reshape_29 = None + attn_output_39 = torch._C._nn.linear( + attn_output_38, + l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_38 = l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_95 = hidden_states_89 + attn_output_39 + hidden_states_89 = attn_output_39 = None + hidden_states_96 = hidden_states_95.to(torch.float32) + pow_20 = hidden_states_96.pow(2) + variance_19 = pow_20.mean(-1, keepdim=True) + pow_20 = None + add_58 = variance_19 + 1e-05 + variance_19 = None + rsqrt_19 = torch.rsqrt(add_58) + add_58 = None + hidden_states_97 = hidden_states_96 * rsqrt_19 + hidden_states_96 = rsqrt_19 = None + to_61 = hidden_states_97.to(torch.bfloat16) + hidden_states_97 = None + hidden_states_98 = ( + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + * to_61 + ) + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = ( + to_61 + ) = None + linear_67 = torch._C._nn.linear( + hidden_states_98, + l_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_9 = torch.nn.functional.silu(linear_67, inplace=False) + linear_67 = None + linear_68 = torch._C._nn.linear( + hidden_states_98, + l_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_98 = l_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_92 = silu_9 * linear_68 + silu_9 = linear_68 = None + down_proj_9 = torch._C._nn.linear( + mul_92, + l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_92 = l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_99 = hidden_states_95 + down_proj_9 + hidden_states_95 = down_proj_9 = None + hidden_states_100 = hidden_states_99.to(torch.float32) + pow_21 = hidden_states_100.pow(2) + variance_20 = pow_21.mean(-1, keepdim=True) + pow_21 = None + add_60 = variance_20 + 1e-05 + variance_20 = None + rsqrt_20 = torch.rsqrt(add_60) + add_60 = None + hidden_states_101 = hidden_states_100 * rsqrt_20 + hidden_states_100 = rsqrt_20 = None + to_63 = hidden_states_101.to(torch.bfloat16) + hidden_states_101 = None + hidden_states_102 = ( + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + * to_63 + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + to_63 + ) = None + linear_70 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_31 = linear_70.view((1, 2, -1, 128)) + linear_70 = None + query_states_20 = view_31.transpose(1, 2) + view_31 = None + linear_71 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_32 = 
linear_71.view((1, 2, -1, 128)) + linear_71 = None + key_states_20 = view_32.transpose(1, 2) + view_32 = None + linear_72 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_102 = l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_33 = linear_72.view((1, 2, -1, 128)) + linear_72 = None + value_states_10 = view_33.transpose(1, 2) + view_33 = None + cos_22 = cos_1.unsqueeze(1) + sin_22 = sin_1.unsqueeze(1) + getitem_95 = cos_22[(Ellipsis, slice(None, 64, None))] + cos_22 = None + cos_23 = getitem_95.repeat_interleave(2, dim=-1) + getitem_95 = None + getitem_96 = sin_22[(Ellipsis, slice(None, 64, None))] + sin_22 = None + sin_23 = getitem_96.repeat_interleave(2, dim=-1) + getitem_96 = None + float_45 = query_states_20.float() + mul_95 = float_45 * cos_23 + float_45 = None + x1_20 = query_states_20[(Ellipsis, slice(0, None, 2))] + x2_20 = query_states_20[(Ellipsis, slice(1, None, 2))] + query_states_20 = None + neg_20 = -x2_20 + x2_20 = None + stack_20 = torch.stack((neg_20, x1_20), dim=-1) + neg_20 = x1_20 = None + flatten_20 = stack_20.flatten(-2) + stack_20 = None + float_46 = flatten_20.float() + flatten_20 = None + mul_96 = float_46 * sin_23 + float_46 = None + q_embed_10 = mul_95 + mul_96 + mul_95 = mul_96 = None + float_47 = key_states_20.float() + mul_97 = float_47 * cos_23 + float_47 = cos_23 = None + x1_21 = key_states_20[(Ellipsis, slice(0, None, 2))] + x2_21 = key_states_20[(Ellipsis, slice(1, None, 2))] + key_states_20 = None + neg_21 = -x2_21 + x2_21 = None + stack_21 = torch.stack((neg_21, x1_21), dim=-1) + neg_21 = x1_21 = None + flatten_21 = stack_21.flatten(-2) + stack_21 = None + float_48 = flatten_21.float() + flatten_21 = None + mul_98 = float_48 * sin_23 + float_48 = sin_23 = None + k_embed_10 = mul_97 + mul_98 + mul_97 = mul_98 = None + query_states_21 = q_embed_10.to(torch.bfloat16) + q_embed_10 = None + key_states_21 = k_embed_10.to(torch.bfloat16) + k_embed_10 = None + getitem_101 = key_states_21[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_103 = getitem_101.expand(1, 2, 8, 2, 128) + getitem_101 = None + key_20 = hidden_states_103.reshape(1, 16, 2, 128) + hidden_states_103 = None + getitem_102 = value_states_10[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_104 = getitem_102.expand(1, 2, 8, 2, 128) + getitem_102 = None + value_20 = hidden_states_104.reshape(1, 16, 2, 128) + hidden_states_104 = None + attention_mask_11 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_10 = query_states_21.contiguous() + query_states_21 = None + key_21 = key_20.contiguous() + key_20 = None + value_21 = value_20.contiguous() + value_20 = None + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + query_10, + key_21, + value_21, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_10 = key_21 = value_21 = attention_mask_11 = None + transpose_44 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_44.contiguous() + transpose_44 = None + reshape_32 = attn_output_41.reshape(1, 2, -1) + attn_output_41 = None + attn_output_42 = reshape_32.contiguous() + 
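The getitem/expand/reshape sequence applied to key_states and value_states in each layer is grouped-query attention: the 2 key/value heads are broadcast across the 16 query heads (group size 8) before attention. A compact equivalent of that unrolled pattern, under the shapes visible in this graph (illustrative helper name):

import torch

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (batch, num_kv_heads, seq, head_dim) -> (batch, num_kv_heads * n_rep, seq, head_dim)
    b, kv, s, d = x.shape
    x = x[:, :, None, :, :].expand(b, kv, n_rep, s, d)  # zero-copy broadcast
    return x.reshape(b, kv * n_rep, s, d)               # reshape materializes the repeat

k = torch.randn(1, 2, 2, 128)    # 2 KV heads, sequence length 2
print(repeat_kv(k, 8).shape)     # torch.Size([1, 16, 2, 128])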
reshape_32 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_42 = l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_105 = hidden_states_99 + attn_output_43 + hidden_states_99 = attn_output_43 = None + hidden_states_106 = hidden_states_105.to(torch.float32) + pow_22 = hidden_states_106.pow(2) + variance_21 = pow_22.mean(-1, keepdim=True) + pow_22 = None + add_64 = variance_21 + 1e-05 + variance_21 = None + rsqrt_21 = torch.rsqrt(add_64) + add_64 = None + hidden_states_107 = hidden_states_106 * rsqrt_21 + hidden_states_106 = rsqrt_21 = None + to_67 = hidden_states_107.to(torch.bfloat16) + hidden_states_107 = None + hidden_states_108 = ( + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + * to_67 + ) + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = ( + to_67 + ) = None + linear_74 = torch._C._nn.linear( + hidden_states_108, + l_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_10 = torch.nn.functional.silu(linear_74, inplace=False) + linear_74 = None + linear_75 = torch._C._nn.linear( + hidden_states_108, + l_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_108 = l_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_101 = silu_10 * linear_75 + silu_10 = linear_75 = None + down_proj_10 = torch._C._nn.linear( + mul_101, + l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_101 = l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_109 = hidden_states_105 + down_proj_10 + hidden_states_105 = down_proj_10 = None + hidden_states_110 = hidden_states_109.to(torch.float32) + pow_23 = hidden_states_110.pow(2) + variance_22 = pow_23.mean(-1, keepdim=True) + pow_23 = None + add_66 = variance_22 + 1e-05 + variance_22 = None + rsqrt_22 = torch.rsqrt(add_66) + add_66 = None + hidden_states_111 = hidden_states_110 * rsqrt_22 + hidden_states_110 = rsqrt_22 = None + to_69 = hidden_states_111.to(torch.bfloat16) + hidden_states_111 = None + hidden_states_112 = ( + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + * to_69 + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + to_69 + ) = None + linear_77 = torch._C._nn.linear( + hidden_states_112, + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_34 = linear_77.view((1, 2, -1, 128)) + linear_77 = None + query_states_22 = view_34.transpose(1, 2) + view_34 = None + linear_78 = torch._C._nn.linear( + hidden_states_112, + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_35 = linear_78.view((1, 2, -1, 128)) + linear_78 = None + key_states_22 = view_35.transpose(1, 2) + view_35 = None + linear_79 = torch._C._nn.linear( + hidden_states_112, + 
l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_112 = l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_36 = linear_79.view((1, 2, -1, 128)) + linear_79 = None + value_states_11 = view_36.transpose(1, 2) + view_36 = None + cos_24 = cos_1.unsqueeze(1) + sin_24 = sin_1.unsqueeze(1) + getitem_104 = cos_24[(Ellipsis, slice(None, 64, None))] + cos_24 = None + cos_25 = getitem_104.repeat_interleave(2, dim=-1) + getitem_104 = None + getitem_105 = sin_24[(Ellipsis, slice(None, 64, None))] + sin_24 = None + sin_25 = getitem_105.repeat_interleave(2, dim=-1) + getitem_105 = None + float_49 = query_states_22.float() + mul_104 = float_49 * cos_25 + float_49 = None + x1_22 = query_states_22[(Ellipsis, slice(0, None, 2))] + x2_22 = query_states_22[(Ellipsis, slice(1, None, 2))] + query_states_22 = None + neg_22 = -x2_22 + x2_22 = None + stack_22 = torch.stack((neg_22, x1_22), dim=-1) + neg_22 = x1_22 = None + flatten_22 = stack_22.flatten(-2) + stack_22 = None + float_50 = flatten_22.float() + flatten_22 = None + mul_105 = float_50 * sin_25 + float_50 = None + q_embed_11 = mul_104 + mul_105 + mul_104 = mul_105 = None + float_51 = key_states_22.float() + mul_106 = float_51 * cos_25 + float_51 = cos_25 = None + x1_23 = key_states_22[(Ellipsis, slice(0, None, 2))] + x2_23 = key_states_22[(Ellipsis, slice(1, None, 2))] + key_states_22 = None + neg_23 = -x2_23 + x2_23 = None + stack_23 = torch.stack((neg_23, x1_23), dim=-1) + neg_23 = x1_23 = None + flatten_23 = stack_23.flatten(-2) + stack_23 = None + float_52 = flatten_23.float() + flatten_23 = None + mul_107 = float_52 * sin_25 + float_52 = sin_25 = None + k_embed_11 = mul_106 + mul_107 + mul_106 = mul_107 = None + query_states_23 = q_embed_11.to(torch.bfloat16) + q_embed_11 = None + key_states_23 = k_embed_11.to(torch.bfloat16) + k_embed_11 = None + getitem_110 = key_states_23[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_113 = getitem_110.expand(1, 2, 8, 2, 128) + getitem_110 = None + key_22 = hidden_states_113.reshape(1, 16, 2, 128) + hidden_states_113 = None + getitem_111 = value_states_11[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_114 = getitem_111.expand(1, 2, 8, 2, 128) + getitem_111 = None + value_22 = hidden_states_114.reshape(1, 16, 2, 128) + hidden_states_114 = None + attention_mask_12 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_11 = query_states_23.contiguous() + query_states_23 = None + key_23 = key_22.contiguous() + key_22 = None + value_23 = value_22.contiguous() + value_22 = None + attn_output_44 = torch._C._nn.scaled_dot_product_attention( + query_11, + key_23, + value_23, + attn_mask=attention_mask_12, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_11 = key_23 = value_23 = attention_mask_12 = None + transpose_48 = attn_output_44.transpose(1, 2) + attn_output_44 = None + attn_output_45 = transpose_48.contiguous() + transpose_48 = None + reshape_35 = attn_output_45.reshape(1, 2, -1) + attn_output_45 = None + attn_output_46 = reshape_35.contiguous() + reshape_35 = None + attn_output_47 = torch._C._nn.linear( + attn_output_46, + 
l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_46 = l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_115 = hidden_states_109 + attn_output_47 + hidden_states_109 = attn_output_47 = None + hidden_states_116 = hidden_states_115.to(torch.float32) + pow_24 = hidden_states_116.pow(2) + variance_23 = pow_24.mean(-1, keepdim=True) + pow_24 = None + add_70 = variance_23 + 1e-05 + variance_23 = None + rsqrt_23 = torch.rsqrt(add_70) + add_70 = None + hidden_states_117 = hidden_states_116 * rsqrt_23 + hidden_states_116 = rsqrt_23 = None + to_73 = hidden_states_117.to(torch.bfloat16) + hidden_states_117 = None + hidden_states_118 = ( + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + * to_73 + ) + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = ( + to_73 + ) = None + linear_81 = torch._C._nn.linear( + hidden_states_118, + l_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_11 = torch.nn.functional.silu(linear_81, inplace=False) + linear_81 = None + linear_82 = torch._C._nn.linear( + hidden_states_118, + l_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_118 = l_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_110 = silu_11 * linear_82 + silu_11 = linear_82 = None + down_proj_11 = torch._C._nn.linear( + mul_110, + l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_110 = l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_119 = hidden_states_115 + down_proj_11 + hidden_states_115 = down_proj_11 = None + hidden_states_120 = hidden_states_119.to(torch.float32) + pow_25 = hidden_states_120.pow(2) + variance_24 = pow_25.mean(-1, keepdim=True) + pow_25 = None + add_72 = variance_24 + 1e-05 + variance_24 = None + rsqrt_24 = torch.rsqrt(add_72) + add_72 = None + hidden_states_121 = hidden_states_120 * rsqrt_24 + hidden_states_120 = rsqrt_24 = None + to_75 = hidden_states_121.to(torch.bfloat16) + hidden_states_121 = None + hidden_states_122 = ( + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + * to_75 + ) + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + to_75 + ) = None + linear_84 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_37 = linear_84.view((1, 2, -1, 128)) + linear_84 = None + query_states_24 = view_37.transpose(1, 2) + view_37 = None + linear_85 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_38 = linear_85.view((1, 2, -1, 128)) + linear_85 = None + key_states_24 = view_38.transpose(1, 2) + view_38 = None + linear_86 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_122 = 
l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_39 = linear_86.view((1, 2, -1, 128)) + linear_86 = None + value_states_12 = view_39.transpose(1, 2) + view_39 = None + cos_26 = cos_1.unsqueeze(1) + sin_26 = sin_1.unsqueeze(1) + getitem_113 = cos_26[(Ellipsis, slice(None, 64, None))] + cos_26 = None + cos_27 = getitem_113.repeat_interleave(2, dim=-1) + getitem_113 = None + getitem_114 = sin_26[(Ellipsis, slice(None, 64, None))] + sin_26 = None + sin_27 = getitem_114.repeat_interleave(2, dim=-1) + getitem_114 = None + float_53 = query_states_24.float() + mul_113 = float_53 * cos_27 + float_53 = None + x1_24 = query_states_24[(Ellipsis, slice(0, None, 2))] + x2_24 = query_states_24[(Ellipsis, slice(1, None, 2))] + query_states_24 = None + neg_24 = -x2_24 + x2_24 = None + stack_24 = torch.stack((neg_24, x1_24), dim=-1) + neg_24 = x1_24 = None + flatten_24 = stack_24.flatten(-2) + stack_24 = None + float_54 = flatten_24.float() + flatten_24 = None + mul_114 = float_54 * sin_27 + float_54 = None + q_embed_12 = mul_113 + mul_114 + mul_113 = mul_114 = None + float_55 = key_states_24.float() + mul_115 = float_55 * cos_27 + float_55 = cos_27 = None + x1_25 = key_states_24[(Ellipsis, slice(0, None, 2))] + x2_25 = key_states_24[(Ellipsis, slice(1, None, 2))] + key_states_24 = None + neg_25 = -x2_25 + x2_25 = None + stack_25 = torch.stack((neg_25, x1_25), dim=-1) + neg_25 = x1_25 = None + flatten_25 = stack_25.flatten(-2) + stack_25 = None + float_56 = flatten_25.float() + flatten_25 = None + mul_116 = float_56 * sin_27 + float_56 = sin_27 = None + k_embed_12 = mul_115 + mul_116 + mul_115 = mul_116 = None + query_states_25 = q_embed_12.to(torch.bfloat16) + q_embed_12 = None + key_states_25 = k_embed_12.to(torch.bfloat16) + k_embed_12 = None + getitem_119 = key_states_25[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_123 = getitem_119.expand(1, 2, 8, 2, 128) + getitem_119 = None + key_24 = hidden_states_123.reshape(1, 16, 2, 128) + hidden_states_123 = None + getitem_120 = value_states_12[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_124 = getitem_120.expand(1, 2, 8, 2, 128) + getitem_120 = None + value_24 = hidden_states_124.reshape(1, 16, 2, 128) + hidden_states_124 = None + attention_mask_13 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_12 = query_states_25.contiguous() + query_states_25 = None + key_25 = key_24.contiguous() + key_24 = None + value_25 = value_24.contiguous() + value_24 = None + attn_output_48 = torch._C._nn.scaled_dot_product_attention( + query_12, + key_25, + value_25, + attn_mask=attention_mask_13, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_12 = key_25 = value_25 = attention_mask_13 = None + transpose_52 = attn_output_48.transpose(1, 2) + attn_output_48 = None + attn_output_49 = transpose_52.contiguous() + transpose_52 = None + reshape_38 = attn_output_49.reshape(1, 2, -1) + attn_output_49 = None + attn_output_50 = reshape_38.contiguous() + reshape_38 = None + attn_output_51 = torch._C._nn.linear( + attn_output_50, + l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_50 = 
l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_125 = hidden_states_119 + attn_output_51 + hidden_states_119 = attn_output_51 = None + hidden_states_126 = hidden_states_125.to(torch.float32) + pow_26 = hidden_states_126.pow(2) + variance_25 = pow_26.mean(-1, keepdim=True) + pow_26 = None + add_76 = variance_25 + 1e-05 + variance_25 = None + rsqrt_25 = torch.rsqrt(add_76) + add_76 = None + hidden_states_127 = hidden_states_126 * rsqrt_25 + hidden_states_126 = rsqrt_25 = None + to_79 = hidden_states_127.to(torch.bfloat16) + hidden_states_127 = None + hidden_states_128 = ( + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + * to_79 + ) + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = ( + to_79 + ) = None + linear_88 = torch._C._nn.linear( + hidden_states_128, + l_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_12 = torch.nn.functional.silu(linear_88, inplace=False) + linear_88 = None + linear_89 = torch._C._nn.linear( + hidden_states_128, + l_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_128 = l_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_119 = silu_12 * linear_89 + silu_12 = linear_89 = None + down_proj_12 = torch._C._nn.linear( + mul_119, + l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_119 = l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_129 = hidden_states_125 + down_proj_12 + hidden_states_125 = down_proj_12 = None + hidden_states_130 = hidden_states_129.to(torch.float32) + pow_27 = hidden_states_130.pow(2) + variance_26 = pow_27.mean(-1, keepdim=True) + pow_27 = None + add_78 = variance_26 + 1e-05 + variance_26 = None + rsqrt_26 = torch.rsqrt(add_78) + add_78 = None + hidden_states_131 = hidden_states_130 * rsqrt_26 + hidden_states_130 = rsqrt_26 = None + to_81 = hidden_states_131.to(torch.bfloat16) + hidden_states_131 = None + hidden_states_132 = ( + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + * to_81 + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + to_81 + ) = None + linear_91 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_40 = linear_91.view((1, 2, -1, 128)) + linear_91 = None + query_states_26 = view_40.transpose(1, 2) + view_40 = None + linear_92 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_41 = linear_92.view((1, 2, -1, 128)) + linear_92 = None + key_states_26 = view_41.transpose(1, 2) + view_41 = None + linear_93 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_132 = l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_42 = linear_93.view((1, 
2, -1, 128)) + linear_93 = None + value_states_13 = view_42.transpose(1, 2) + view_42 = None + cos_28 = cos_1.unsqueeze(1) + sin_28 = sin_1.unsqueeze(1) + getitem_122 = cos_28[(Ellipsis, slice(None, 64, None))] + cos_28 = None + cos_29 = getitem_122.repeat_interleave(2, dim=-1) + getitem_122 = None + getitem_123 = sin_28[(Ellipsis, slice(None, 64, None))] + sin_28 = None + sin_29 = getitem_123.repeat_interleave(2, dim=-1) + getitem_123 = None + float_57 = query_states_26.float() + mul_122 = float_57 * cos_29 + float_57 = None + x1_26 = query_states_26[(Ellipsis, slice(0, None, 2))] + x2_26 = query_states_26[(Ellipsis, slice(1, None, 2))] + query_states_26 = None + neg_26 = -x2_26 + x2_26 = None + stack_26 = torch.stack((neg_26, x1_26), dim=-1) + neg_26 = x1_26 = None + flatten_26 = stack_26.flatten(-2) + stack_26 = None + float_58 = flatten_26.float() + flatten_26 = None + mul_123 = float_58 * sin_29 + float_58 = None + q_embed_13 = mul_122 + mul_123 + mul_122 = mul_123 = None + float_59 = key_states_26.float() + mul_124 = float_59 * cos_29 + float_59 = cos_29 = None + x1_27 = key_states_26[(Ellipsis, slice(0, None, 2))] + x2_27 = key_states_26[(Ellipsis, slice(1, None, 2))] + key_states_26 = None + neg_27 = -x2_27 + x2_27 = None + stack_27 = torch.stack((neg_27, x1_27), dim=-1) + neg_27 = x1_27 = None + flatten_27 = stack_27.flatten(-2) + stack_27 = None + float_60 = flatten_27.float() + flatten_27 = None + mul_125 = float_60 * sin_29 + float_60 = sin_29 = None + k_embed_13 = mul_124 + mul_125 + mul_124 = mul_125 = None + query_states_27 = q_embed_13.to(torch.bfloat16) + q_embed_13 = None + key_states_27 = k_embed_13.to(torch.bfloat16) + k_embed_13 = None + getitem_128 = key_states_27[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_133 = getitem_128.expand(1, 2, 8, 2, 128) + getitem_128 = None + key_26 = hidden_states_133.reshape(1, 16, 2, 128) + hidden_states_133 = None + getitem_129 = value_states_13[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_134 = getitem_129.expand(1, 2, 8, 2, 128) + getitem_129 = None + value_26 = hidden_states_134.reshape(1, 16, 2, 128) + hidden_states_134 = None + attention_mask_14 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_13 = query_states_27.contiguous() + query_states_27 = None + key_27 = key_26.contiguous() + key_26 = None + value_27 = value_26.contiguous() + value_26 = None + attn_output_52 = torch._C._nn.scaled_dot_product_attention( + query_13, + key_27, + value_27, + attn_mask=attention_mask_14, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_13 = key_27 = value_27 = attention_mask_14 = None + transpose_56 = attn_output_52.transpose(1, 2) + attn_output_52 = None + attn_output_53 = transpose_56.contiguous() + transpose_56 = None + reshape_41 = attn_output_53.reshape(1, 2, -1) + attn_output_53 = None + attn_output_54 = reshape_41.contiguous() + reshape_41 = None + attn_output_55 = torch._C._nn.linear( + attn_output_54, + l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_54 = l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_135 = hidden_states_129 + attn_output_55 + hidden_states_129 = attn_output_55 = None + 
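Stripped of the tracer's single-assignment naming, every repeated span in this file is the same pre-norm decoder block: RMSNorm, attention, residual add, then RMSNorm, MLP, residual add. Roughly, in module form (a sketch of the structure only, not the patch's code):

def decoder_block(hidden, input_norm, attn, post_attn_norm, mlp):
    hidden = hidden + attn(input_norm(hidden))       # pre-norm residual attention
    hidden = hidden + mlp(post_attn_norm(hidden))    # pre-norm residual MLP
    return hidden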
hidden_states_136 = hidden_states_135.to(torch.float32) + pow_28 = hidden_states_136.pow(2) + variance_27 = pow_28.mean(-1, keepdim=True) + pow_28 = None + add_82 = variance_27 + 1e-05 + variance_27 = None + rsqrt_27 = torch.rsqrt(add_82) + add_82 = None + hidden_states_137 = hidden_states_136 * rsqrt_27 + hidden_states_136 = rsqrt_27 = None + to_85 = hidden_states_137.to(torch.bfloat16) + hidden_states_137 = None + hidden_states_138 = ( + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + * to_85 + ) + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = ( + to_85 + ) = None + linear_95 = torch._C._nn.linear( + hidden_states_138, + l_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_13 = torch.nn.functional.silu(linear_95, inplace=False) + linear_95 = None + linear_96 = torch._C._nn.linear( + hidden_states_138, + l_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_138 = l_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_128 = silu_13 * linear_96 + silu_13 = linear_96 = None + down_proj_13 = torch._C._nn.linear( + mul_128, + l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_128 = l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_139 = hidden_states_135 + down_proj_13 + hidden_states_135 = down_proj_13 = None + hidden_states_140 = hidden_states_139.to(torch.float32) + pow_29 = hidden_states_140.pow(2) + variance_28 = pow_29.mean(-1, keepdim=True) + pow_29 = None + add_84 = variance_28 + 1e-05 + variance_28 = None + rsqrt_28 = torch.rsqrt(add_84) + add_84 = None + hidden_states_141 = hidden_states_140 * rsqrt_28 + hidden_states_140 = rsqrt_28 = None + to_87 = hidden_states_141.to(torch.bfloat16) + hidden_states_141 = None + hidden_states_142 = ( + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + * to_87 + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + to_87 + ) = None + linear_98 = torch._C._nn.linear( + hidden_states_142, + l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_43 = linear_98.view((1, 2, -1, 128)) + linear_98 = None + query_states_28 = view_43.transpose(1, 2) + view_43 = None + linear_99 = torch._C._nn.linear( + hidden_states_142, + l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_44 = linear_99.view((1, 2, -1, 128)) + linear_99 = None + key_states_28 = view_44.transpose(1, 2) + view_44 = None + linear_100 = torch._C._nn.linear( + hidden_states_142, + l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_142 = l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_45 = linear_100.view((1, 2, -1, 128)) + linear_100 = None + value_states_14 = view_45.transpose(1, 2) + view_45 = None + cos_30 = cos_1.unsqueeze(1) + sin_30 = sin_1.unsqueeze(1) + getitem_131 = cos_30[(Ellipsis, 
slice(None, 64, None))] + cos_30 = None + cos_31 = getitem_131.repeat_interleave(2, dim=-1) + getitem_131 = None + getitem_132 = sin_30[(Ellipsis, slice(None, 64, None))] + sin_30 = None + sin_31 = getitem_132.repeat_interleave(2, dim=-1) + getitem_132 = None + float_61 = query_states_28.float() + mul_131 = float_61 * cos_31 + float_61 = None + x1_28 = query_states_28[(Ellipsis, slice(0, None, 2))] + x2_28 = query_states_28[(Ellipsis, slice(1, None, 2))] + query_states_28 = None + neg_28 = -x2_28 + x2_28 = None + stack_28 = torch.stack((neg_28, x1_28), dim=-1) + neg_28 = x1_28 = None + flatten_28 = stack_28.flatten(-2) + stack_28 = None + float_62 = flatten_28.float() + flatten_28 = None + mul_132 = float_62 * sin_31 + float_62 = None + q_embed_14 = mul_131 + mul_132 + mul_131 = mul_132 = None + float_63 = key_states_28.float() + mul_133 = float_63 * cos_31 + float_63 = cos_31 = None + x1_29 = key_states_28[(Ellipsis, slice(0, None, 2))] + x2_29 = key_states_28[(Ellipsis, slice(1, None, 2))] + key_states_28 = None + neg_29 = -x2_29 + x2_29 = None + stack_29 = torch.stack((neg_29, x1_29), dim=-1) + neg_29 = x1_29 = None + flatten_29 = stack_29.flatten(-2) + stack_29 = None + float_64 = flatten_29.float() + flatten_29 = None + mul_134 = float_64 * sin_31 + float_64 = sin_31 = None + k_embed_14 = mul_133 + mul_134 + mul_133 = mul_134 = None + query_states_29 = q_embed_14.to(torch.bfloat16) + q_embed_14 = None + key_states_29 = k_embed_14.to(torch.bfloat16) + k_embed_14 = None + getitem_137 = key_states_29[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_143 = getitem_137.expand(1, 2, 8, 2, 128) + getitem_137 = None + key_28 = hidden_states_143.reshape(1, 16, 2, 128) + hidden_states_143 = None + getitem_138 = value_states_14[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_144 = getitem_138.expand(1, 2, 8, 2, 128) + getitem_138 = None + value_28 = hidden_states_144.reshape(1, 16, 2, 128) + hidden_states_144 = None + attention_mask_15 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_14 = query_states_29.contiguous() + query_states_29 = None + key_29 = key_28.contiguous() + key_28 = None + value_29 = value_28.contiguous() + value_28 = None + attn_output_56 = torch._C._nn.scaled_dot_product_attention( + query_14, + key_29, + value_29, + attn_mask=attention_mask_15, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_14 = key_29 = value_29 = attention_mask_15 = None + transpose_60 = attn_output_56.transpose(1, 2) + attn_output_56 = None + attn_output_57 = transpose_60.contiguous() + transpose_60 = None + reshape_44 = attn_output_57.reshape(1, 2, -1) + attn_output_57 = None + attn_output_58 = reshape_44.contiguous() + reshape_44 = None + attn_output_59 = torch._C._nn.linear( + attn_output_58, + l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_58 = l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_145 = hidden_states_139 + attn_output_59 + hidden_states_139 = attn_output_59 = None + hidden_states_146 = hidden_states_145.to(torch.float32) + pow_30 = hidden_states_146.pow(2) + variance_29 = pow_30.mean(-1, keepdim=True) + pow_30 = None + add_88 = variance_29 + 1e-05 + 
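The recurring pow(2).mean(-1, keepdim=True), + 1e-05, rsqrt, multiply sequence around each add_* above is an unrolled RMSNorm: the reduction runs in float32 and the result is cast back to bfloat16 before the learned weight is applied. A compact equivalent:

import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    h = x.to(torch.float32)                                   # upcast for a stable reduction
    h = h * torch.rsqrt(h.pow(2).mean(-1, keepdim=True) + eps)
    return weight * h.to(torch.bfloat16)                      # downcast, then scale

x = torch.randn(1, 2, 1024, dtype=torch.bfloat16)
w = torch.ones(1024, dtype=torch.bfloat16)
print(rms_norm(x, w).shape)   # torch.Size([1, 2, 1024])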
variance_29 = None + rsqrt_29 = torch.rsqrt(add_88) + add_88 = None + hidden_states_147 = hidden_states_146 * rsqrt_29 + hidden_states_146 = rsqrt_29 = None + to_91 = hidden_states_147.to(torch.bfloat16) + hidden_states_147 = None + hidden_states_148 = ( + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + * to_91 + ) + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = ( + to_91 + ) = None + linear_102 = torch._C._nn.linear( + hidden_states_148, + l_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_14 = torch.nn.functional.silu(linear_102, inplace=False) + linear_102 = None + linear_103 = torch._C._nn.linear( + hidden_states_148, + l_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_148 = l_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_137 = silu_14 * linear_103 + silu_14 = linear_103 = None + down_proj_14 = torch._C._nn.linear( + mul_137, + l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_137 = l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_149 = hidden_states_145 + down_proj_14 + hidden_states_145 = down_proj_14 = None + hidden_states_150 = hidden_states_149.to(torch.float32) + pow_31 = hidden_states_150.pow(2) + variance_30 = pow_31.mean(-1, keepdim=True) + pow_31 = None + add_90 = variance_30 + 1e-05 + variance_30 = None + rsqrt_30 = torch.rsqrt(add_90) + add_90 = None + hidden_states_151 = hidden_states_150 * rsqrt_30 + hidden_states_150 = rsqrt_30 = None + to_93 = hidden_states_151.to(torch.bfloat16) + hidden_states_151 = None + hidden_states_152 = ( + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + * to_93 + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + to_93 + ) = None + linear_105 = torch._C._nn.linear( + hidden_states_152, + l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_46 = linear_105.view((1, 2, -1, 128)) + linear_105 = None + query_states_30 = view_46.transpose(1, 2) + view_46 = None + linear_106 = torch._C._nn.linear( + hidden_states_152, + l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_47 = linear_106.view((1, 2, -1, 128)) + linear_106 = None + key_states_30 = view_47.transpose(1, 2) + view_47 = None + linear_107 = torch._C._nn.linear( + hidden_states_152, + l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_152 = l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_48 = linear_107.view((1, 2, -1, 128)) + linear_107 = None + value_states_15 = view_48.transpose(1, 2) + view_48 = None + cos_32 = cos_1.unsqueeze(1) + sin_32 = sin_1.unsqueeze(1) + getitem_140 = cos_32[(Ellipsis, slice(None, 64, None))] + cos_32 = None + cos_33 = getitem_140.repeat_interleave(2, dim=-1) + getitem_140 = None + getitem_141 = sin_32[(Ellipsis, slice(None, 64, None))] + sin_32 = 
None + sin_33 = getitem_141.repeat_interleave(2, dim=-1) + getitem_141 = None + float_65 = query_states_30.float() + mul_140 = float_65 * cos_33 + float_65 = None + x1_30 = query_states_30[(Ellipsis, slice(0, None, 2))] + x2_30 = query_states_30[(Ellipsis, slice(1, None, 2))] + query_states_30 = None + neg_30 = -x2_30 + x2_30 = None + stack_30 = torch.stack((neg_30, x1_30), dim=-1) + neg_30 = x1_30 = None + flatten_30 = stack_30.flatten(-2) + stack_30 = None + float_66 = flatten_30.float() + flatten_30 = None + mul_141 = float_66 * sin_33 + float_66 = None + q_embed_15 = mul_140 + mul_141 + mul_140 = mul_141 = None + float_67 = key_states_30.float() + mul_142 = float_67 * cos_33 + float_67 = cos_33 = None + x1_31 = key_states_30[(Ellipsis, slice(0, None, 2))] + x2_31 = key_states_30[(Ellipsis, slice(1, None, 2))] + key_states_30 = None + neg_31 = -x2_31 + x2_31 = None + stack_31 = torch.stack((neg_31, x1_31), dim=-1) + neg_31 = x1_31 = None + flatten_31 = stack_31.flatten(-2) + stack_31 = None + float_68 = flatten_31.float() + flatten_31 = None + mul_143 = float_68 * sin_33 + float_68 = sin_33 = None + k_embed_15 = mul_142 + mul_143 + mul_142 = mul_143 = None + query_states_31 = q_embed_15.to(torch.bfloat16) + q_embed_15 = None + key_states_31 = k_embed_15.to(torch.bfloat16) + k_embed_15 = None + getitem_146 = key_states_31[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_153 = getitem_146.expand(1, 2, 8, 2, 128) + getitem_146 = None + key_30 = hidden_states_153.reshape(1, 16, 2, 128) + hidden_states_153 = None + getitem_147 = value_states_15[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_154 = getitem_147.expand(1, 2, 8, 2, 128) + getitem_147 = None + value_30 = hidden_states_154.reshape(1, 16, 2, 128) + hidden_states_154 = None + attention_mask_16 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_15 = query_states_31.contiguous() + query_states_31 = None + key_31 = key_30.contiguous() + key_30 = None + value_31 = value_30.contiguous() + value_30 = None + attn_output_60 = torch._C._nn.scaled_dot_product_attention( + query_15, + key_31, + value_31, + attn_mask=attention_mask_16, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_15 = key_31 = value_31 = attention_mask_16 = None + transpose_64 = attn_output_60.transpose(1, 2) + attn_output_60 = None + attn_output_61 = transpose_64.contiguous() + transpose_64 = None + reshape_47 = attn_output_61.reshape(1, 2, -1) + attn_output_61 = None + attn_output_62 = reshape_47.contiguous() + reshape_47 = None + attn_output_63 = torch._C._nn.linear( + attn_output_62, + l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_62 = l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_155 = hidden_states_149 + attn_output_63 + hidden_states_149 = attn_output_63 = None + hidden_states_156 = hidden_states_155.to(torch.float32) + pow_32 = hidden_states_156.pow(2) + variance_31 = pow_32.mean(-1, keepdim=True) + pow_32 = None + add_94 = variance_31 + 1e-05 + variance_31 = None + rsqrt_31 = torch.rsqrt(add_94) + add_94 = None + hidden_states_157 = hidden_states_156 * rsqrt_31 + hidden_states_156 = rsqrt_31 = None + to_97 = 
hidden_states_157.to(torch.bfloat16) + hidden_states_157 = None + hidden_states_158 = ( + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + * to_97 + ) + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = ( + to_97 + ) = None + linear_109 = torch._C._nn.linear( + hidden_states_158, + l_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_15 = torch.nn.functional.silu(linear_109, inplace=False) + linear_109 = None + linear_110 = torch._C._nn.linear( + hidden_states_158, + l_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_158 = l_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_146 = silu_15 * linear_110 + silu_15 = linear_110 = None + down_proj_15 = torch._C._nn.linear( + mul_146, + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_146 = l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_159 = hidden_states_155 + down_proj_15 + hidden_states_155 = down_proj_15 = None + hidden_states_160 = hidden_states_159.to(torch.float32) + pow_33 = hidden_states_160.pow(2) + variance_32 = pow_33.mean(-1, keepdim=True) + pow_33 = None + add_96 = variance_32 + 1e-05 + variance_32 = None + rsqrt_32 = torch.rsqrt(add_96) + add_96 = None + hidden_states_161 = hidden_states_160 * rsqrt_32 + hidden_states_160 = rsqrt_32 = None + to_99 = hidden_states_161.to(torch.bfloat16) + hidden_states_161 = None + hidden_states_162 = ( + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + * to_99 + ) + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + to_99 + ) = None + linear_112 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_49 = linear_112.view((1, 2, -1, 128)) + linear_112 = None + query_states_32 = view_49.transpose(1, 2) + view_49 = None + linear_113 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_50 = linear_113.view((1, 2, -1, 128)) + linear_113 = None + key_states_32 = view_50.transpose(1, 2) + view_50 = None + linear_114 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_162 = l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_51 = linear_114.view((1, 2, -1, 128)) + linear_114 = None + value_states_16 = view_51.transpose(1, 2) + view_51 = None + cos_34 = cos_1.unsqueeze(1) + sin_34 = sin_1.unsqueeze(1) + getitem_149 = cos_34[(Ellipsis, slice(None, 64, None))] + cos_34 = None + cos_35 = getitem_149.repeat_interleave(2, dim=-1) + getitem_149 = None + getitem_150 = sin_34[(Ellipsis, slice(None, 64, None))] + sin_34 = None + sin_35 = getitem_150.repeat_interleave(2, dim=-1) + getitem_150 = None + float_69 = query_states_32.float() + mul_149 = float_69 * cos_35 + float_69 = None + 
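The even/odd slicing (x[..., 0::2] and x[..., 1::2]), negation, stack, and flatten(-2) preceding each attention call implement the interleaved variant of rotary position embeddings, with cos/sin first truncated to 64 entries and expanded via repeat_interleave(2) so each adjacent channel pair shares one rotation angle. Condensed (illustrative names, float32 math as in the trace):

import torch

def rotate_pairs(x: torch.Tensor) -> torch.Tensor:
    # (x0, x1, x2, x3, ...) -> (-x1, x0, -x3, x2, ...)
    x1, x2 = x[..., 0::2], x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)

def apply_rope(q, k, cos, sin):
    # cos/sin: (batch, 1, seq, head_dim), already expanded to full head width
    q_out = q.float() * cos + rotate_pairs(q).float() * sin
    k_out = k.float() * cos + rotate_pairs(k).float() * sin
    return q_out.to(q.dtype), k_out.to(k.dtype)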
x1_32 = query_states_32[(Ellipsis, slice(0, None, 2))] + x2_32 = query_states_32[(Ellipsis, slice(1, None, 2))] + query_states_32 = None + neg_32 = -x2_32 + x2_32 = None + stack_32 = torch.stack((neg_32, x1_32), dim=-1) + neg_32 = x1_32 = None + flatten_32 = stack_32.flatten(-2) + stack_32 = None + float_70 = flatten_32.float() + flatten_32 = None + mul_150 = float_70 * sin_35 + float_70 = None + q_embed_16 = mul_149 + mul_150 + mul_149 = mul_150 = None + float_71 = key_states_32.float() + mul_151 = float_71 * cos_35 + float_71 = cos_35 = None + x1_33 = key_states_32[(Ellipsis, slice(0, None, 2))] + x2_33 = key_states_32[(Ellipsis, slice(1, None, 2))] + key_states_32 = None + neg_33 = -x2_33 + x2_33 = None + stack_33 = torch.stack((neg_33, x1_33), dim=-1) + neg_33 = x1_33 = None + flatten_33 = stack_33.flatten(-2) + stack_33 = None + float_72 = flatten_33.float() + flatten_33 = None + mul_152 = float_72 * sin_35 + float_72 = sin_35 = None + k_embed_16 = mul_151 + mul_152 + mul_151 = mul_152 = None + query_states_33 = q_embed_16.to(torch.bfloat16) + q_embed_16 = None + key_states_33 = k_embed_16.to(torch.bfloat16) + k_embed_16 = None + getitem_155 = key_states_33[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_163 = getitem_155.expand(1, 2, 8, 2, 128) + getitem_155 = None + key_32 = hidden_states_163.reshape(1, 16, 2, 128) + hidden_states_163 = None + getitem_156 = value_states_16[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_164 = getitem_156.expand(1, 2, 8, 2, 128) + getitem_156 = None + value_32 = hidden_states_164.reshape(1, 16, 2, 128) + hidden_states_164 = None + attention_mask_17 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_16 = query_states_33.contiguous() + query_states_33 = None + key_33 = key_32.contiguous() + key_32 = None + value_33 = value_32.contiguous() + value_32 = None + attn_output_64 = torch._C._nn.scaled_dot_product_attention( + query_16, + key_33, + value_33, + attn_mask=attention_mask_17, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_16 = key_33 = value_33 = attention_mask_17 = None + transpose_68 = attn_output_64.transpose(1, 2) + attn_output_64 = None + attn_output_65 = transpose_68.contiguous() + transpose_68 = None + reshape_50 = attn_output_65.reshape(1, 2, -1) + attn_output_65 = None + attn_output_66 = reshape_50.contiguous() + reshape_50 = None + attn_output_67 = torch._C._nn.linear( + attn_output_66, + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_66 = l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_165 = hidden_states_159 + attn_output_67 + hidden_states_159 = attn_output_67 = None + hidden_states_166 = hidden_states_165.to(torch.float32) + pow_34 = hidden_states_166.pow(2) + variance_33 = pow_34.mean(-1, keepdim=True) + pow_34 = None + add_100 = variance_33 + 1e-05 + variance_33 = None + rsqrt_33 = torch.rsqrt(add_100) + add_100 = None + hidden_states_167 = hidden_states_166 * rsqrt_33 + hidden_states_166 = rsqrt_33 = None + to_103 = hidden_states_167.to(torch.bfloat16) + hidden_states_167 = None + hidden_states_168 = ( + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + * 
to_103 + ) + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = ( + to_103 + ) = None + linear_116 = torch._C._nn.linear( + hidden_states_168, + l_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_16 = torch.nn.functional.silu(linear_116, inplace=False) + linear_116 = None + linear_117 = torch._C._nn.linear( + hidden_states_168, + l_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_168 = l_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_155 = silu_16 * linear_117 + silu_16 = linear_117 = None + down_proj_16 = torch._C._nn.linear( + mul_155, + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_155 = l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_169 = hidden_states_165 + down_proj_16 + hidden_states_165 = down_proj_16 = None + hidden_states_170 = hidden_states_169.to(torch.float32) + pow_35 = hidden_states_170.pow(2) + variance_34 = pow_35.mean(-1, keepdim=True) + pow_35 = None + add_102 = variance_34 + 1e-05 + variance_34 = None + rsqrt_34 = torch.rsqrt(add_102) + add_102 = None + hidden_states_171 = hidden_states_170 * rsqrt_34 + hidden_states_170 = rsqrt_34 = None + to_105 = hidden_states_171.to(torch.bfloat16) + hidden_states_171 = None + hidden_states_172 = ( + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + * to_105 + ) + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + to_105 + ) = None + linear_119 = torch._C._nn.linear( + hidden_states_172, + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_52 = linear_119.view((1, 2, -1, 128)) + linear_119 = None + query_states_34 = view_52.transpose(1, 2) + view_52 = None + linear_120 = torch._C._nn.linear( + hidden_states_172, + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_53 = linear_120.view((1, 2, -1, 128)) + linear_120 = None + key_states_34 = view_53.transpose(1, 2) + view_53 = None + linear_121 = torch._C._nn.linear( + hidden_states_172, + l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_172 = l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_54 = linear_121.view((1, 2, -1, 128)) + linear_121 = None + value_states_17 = view_54.transpose(1, 2) + view_54 = None + cos_36 = cos_1.unsqueeze(1) + cos_1 = None + sin_36 = sin_1.unsqueeze(1) + sin_1 = None + getitem_158 = cos_36[(Ellipsis, slice(None, 64, None))] + cos_36 = None + cos_37 = getitem_158.repeat_interleave(2, dim=-1) + getitem_158 = None + getitem_159 = sin_36[(Ellipsis, slice(None, 64, None))] + sin_36 = None + sin_37 = getitem_159.repeat_interleave(2, dim=-1) + getitem_159 = None + float_73 = query_states_34.float() + mul_158 = float_73 * cos_37 + float_73 = None + x1_34 = query_states_34[(Ellipsis, slice(0, None, 2))] + x2_34 = query_states_34[(Ellipsis, slice(1, None, 2))] + query_states_34 = None + 
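Each block's feed-forward, unrolled above as gate_proj, silu, elementwise multiply with up_proj, then down_proj, is the SwiGLU (SiLU-gated) MLP; with this model's shapes from weight_meta.py (1024 hidden, 3072 intermediate) a compact equivalent is:

import torch
import torch.nn.functional as F

def swiglu_mlp(x, w_gate, w_up, w_down):
    # weights stored as (out_features, in_features), as in the weight_meta entries
    return F.linear(F.silu(F.linear(x, w_gate)) * F.linear(x, w_up), w_down)

x = torch.randn(1, 2, 1024)
w_gate, w_up = torch.randn(3072, 1024), torch.randn(3072, 1024)
w_down = torch.randn(1024, 3072)
print(swiglu_mlp(x, w_gate, w_up, w_down).shape)  # torch.Size([1, 2, 1024])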
neg_34 = -x2_34 + x2_34 = None + stack_34 = torch.stack((neg_34, x1_34), dim=-1) + neg_34 = x1_34 = None + flatten_34 = stack_34.flatten(-2) + stack_34 = None + float_74 = flatten_34.float() + flatten_34 = None + mul_159 = float_74 * sin_37 + float_74 = None + q_embed_17 = mul_158 + mul_159 + mul_158 = mul_159 = None + float_75 = key_states_34.float() + mul_160 = float_75 * cos_37 + float_75 = cos_37 = None + x1_35 = key_states_34[(Ellipsis, slice(0, None, 2))] + x2_35 = key_states_34[(Ellipsis, slice(1, None, 2))] + key_states_34 = None + neg_35 = -x2_35 + x2_35 = None + stack_35 = torch.stack((neg_35, x1_35), dim=-1) + neg_35 = x1_35 = None + flatten_35 = stack_35.flatten(-2) + stack_35 = None + float_76 = flatten_35.float() + flatten_35 = None + mul_161 = float_76 * sin_37 + float_76 = sin_37 = None + k_embed_17 = mul_160 + mul_161 + mul_160 = mul_161 = None + query_states_35 = q_embed_17.to(torch.bfloat16) + q_embed_17 = None + key_states_35 = k_embed_17.to(torch.bfloat16) + k_embed_17 = None + getitem_164 = key_states_35[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_173 = getitem_164.expand(1, 2, 8, 2, 128) + getitem_164 = None + key_34 = hidden_states_173.reshape(1, 16, 2, 128) + hidden_states_173 = None + getitem_165 = value_states_17[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_174 = getitem_165.expand(1, 2, 8, 2, 128) + getitem_165 = None + value_34 = hidden_states_174.reshape(1, 16, 2, 128) + hidden_states_174 = None + attention_mask_18 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + causal_mask_2 = None + query_17 = query_states_35.contiguous() + query_states_35 = None + key_35 = key_34.contiguous() + key_34 = None + value_35 = value_34.contiguous() + value_34 = None + attn_output_68 = torch._C._nn.scaled_dot_product_attention( + query_17, + key_35, + value_35, + attn_mask=attention_mask_18, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_17 = key_35 = value_35 = attention_mask_18 = None + transpose_72 = attn_output_68.transpose(1, 2) + attn_output_68 = None + attn_output_69 = transpose_72.contiguous() + transpose_72 = None + reshape_53 = attn_output_69.reshape(1, 2, -1) + attn_output_69 = None + attn_output_70 = reshape_53.contiguous() + reshape_53 = None + attn_output_71 = torch._C._nn.linear( + attn_output_70, + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_70 = l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_175 = hidden_states_169 + attn_output_71 + hidden_states_169 = attn_output_71 = None + hidden_states_176 = hidden_states_175.to(torch.float32) + pow_36 = hidden_states_176.pow(2) + variance_35 = pow_36.mean(-1, keepdim=True) + pow_36 = None + add_106 = variance_35 + 1e-05 + variance_35 = None + rsqrt_35 = torch.rsqrt(add_106) + add_106 = None + hidden_states_177 = hidden_states_176 * rsqrt_35 + hidden_states_176 = rsqrt_35 = None + to_109 = hidden_states_177.to(torch.bfloat16) + hidden_states_177 = None + hidden_states_178 = ( + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + * to_109 + ) + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = ( + to_109 + ) = None 
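Note that alongside the final hidden states the graph returns every layer's key/value tensors: the keys post-RoPE and the values before the grouped-query expansion. These are the per-layer KV-cache entries a runtime would retain between decoding steps; an illustrative sketch of consuming such a flat output tuple (hypothetical wrapper, not in the patch):

def decode_step(graph_fn, inputs_embeds, attention_mask):
    *kv_flat, hidden = graph_fn(inputs_embeds, attention_mask)
    # kv_flat holds the per-layer key/value states in the order the tracer
    # emitted them; a runtime would stash them as the KV cache and feed
    # them back (suitably repacked) on the next token.
    return hidden, kv_flat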
+ linear_123 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_17 = torch.nn.functional.silu(linear_123, inplace=False) + linear_123 = None + linear_124 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_178 = l_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_164 = silu_17 * linear_124 + silu_17 = linear_124 = None + down_proj_17 = torch._C._nn.linear( + mul_164, + l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_164 = l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_179 = hidden_states_175 + down_proj_17 + hidden_states_175 = down_proj_17 = None + hidden_states_180 = hidden_states_179.to(torch.float32) + hidden_states_179 = None + pow_37 = hidden_states_180.pow(2) + variance_36 = pow_37.mean(-1, keepdim=True) + pow_37 = None + add_108 = variance_36 + 1e-05 + variance_36 = None + rsqrt_36 = torch.rsqrt(add_108) + add_108 = None + hidden_states_181 = hidden_states_180 * rsqrt_36 + hidden_states_180 = rsqrt_36 = None + to_111 = hidden_states_181.to(torch.bfloat16) + hidden_states_181 = None + hidden_states_182 = l_self_modules_norm_parameters_weight_ * to_111 + l_self_modules_norm_parameters_weight_ = to_111 = None + return ( + value_states, + key_states_1, + value_states_1, + key_states_3, + value_states_2, + key_states_5, + value_states_3, + key_states_7, + value_states_4, + key_states_9, + value_states_5, + key_states_11, + value_states_6, + key_states_13, + value_states_7, + key_states_15, + value_states_8, + key_states_17, + value_states_9, + key_states_19, + value_states_10, + key_states_21, + value_states_11, + key_states_23, + value_states_12, + key_states_25, + value_states_13, + key_states_27, + value_states_14, + key_states_29, + value_states_15, + key_states_31, + value_states_16, + key_states_33, + value_states_17, + key_states_35, + hidden_states_182, + ) diff --git a/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/weight_meta.py b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/weight_meta.py new file mode 100644 index 000000000..74d37e192 --- /dev/null +++ b/samples/transformers-auto-model/baidu/ERNIE-4.5-0.3B-PT/weight_meta.py @@ -0,0 +1,1678 @@ +class Program_weight_tensor_meta_L_inputs_embeds_: + name = "L_inputs_embeds_" + shape = [1, 2, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1] + + +class Program_weight_tensor_meta_L_self_modules_rotary_emb_buffers_inv_freq_: + name = "L_self_modules_rotary_emb_buffers_inv_freq_" + shape = [64] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.084 + std = 0.200 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 
0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = 
"torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_" + 
shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 
-0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = 
"torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_" + 
shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = 
"cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_" + 
shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_norm_parameters_weight_: + name = "L_self_modules_norm_parameters_weight_" + shape = [1024] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None diff --git 
a/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/graph_hash.txt b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/graph_hash.txt new file mode 100644 index 000000000..63a79d2fb --- /dev/null +++ b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/graph_hash.txt @@ -0,0 +1 @@ +5ea54bb8f19e3176bee13f08a8ffa6d2273f53ae03f215e7b14c29a297972c91 \ No newline at end of file diff --git a/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/graph_net.json b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/input_meta.py b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/input_tensor_constraints.py b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/model.py b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/model.py new file mode 100644 index 000000000..63e4216fe --- /dev/null +++ b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/model.py @@ -0,0 +1,4555 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_inputs_embeds_: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_self_modules_rotary_emb_buffers_inv_freq_: torch.Tensor, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
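        # (editor's note) Dynamo lifts every module parameter into a flat
+        # forward() argument whose name appears to encode the module path, e.g.
+        # L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_
+        # corresponds to self.layers[4].mlp.down_proj.weight in the source model.
+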
L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
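        # (editor's note) Each of the 24 decoder layers (indices 0-23 below)
+        # contributes the same nine tensors: input_layernorm, the q/k/v/o
+        # attention projections, post_attention_layernorm, and the
+        # gate/up/down MLP projections. No bias terms appear, which is
+        # consistent with a LLaMA-style architecture.
+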
L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + 
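        # (editor's note) Because the weights enter as explicit arguments
+        # rather than module attributes, this GraphModule is purely
+        # functional; replaying it presumably means materializing each tensor
+        # described in weight_meta.py, e.g. (hypothetical sketch, `meta` being
+        # one of the Program_weight_tensor_meta_* classes):
+        #     w = torch.empty(meta.shape, dtype=torch.bfloat16, device=meta.device)
+        #     w.normal_(mean=meta.mean, std=meta.std)
+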
L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
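        # (editor's note) Apart from inputs_embeds and attention_mask, the
+        # only non-parameter input is the rotary-embedding inv_freq buffer
+        # declared at the top of this signature.
+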
L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
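        # (editor's note) The gate/up/down triple suggests a SwiGLU-style MLP,
+        # roughly down_proj(silu(gate_proj(x)) * up_proj(x)); the
+        # corresponding ops appear further down in the graph body.
+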
L_self_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_norm_parameters_weight_: torch.nn.parameter.Parameter, + ): + l_inputs_embeds_ = L_inputs_embeds_ + l_attention_mask_ = L_attention_mask_ + l_self_modules_rotary_emb_buffers_inv_freq_ = ( + L_self_modules_rotary_emb_buffers_inv_freq_ + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = 
L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = 
L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = 
L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = 
L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = 
L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ + 
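        # (editor's note) This prologue only rebinds each placeholder to a
+        # lower-case local; throughout the graph, names are assigned None at
+        # their last use so the generated code releases buffers as early as
+        # possible.
+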
l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = 
L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ + 
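        # (editor's note) The uneven wrapping of these rebindings (some
+        # parenthesized across lines, some not) appears to be an automatic
+        # formatter applied to generated code and carries no semantic meaning.
+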
l_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + 
L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ + 
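        # (editor's note) The rebinding prologue ends with the final-norm
+        # weight; the traced computation then begins with position ids, the
+        # combined causal/padding mask, and the rotary cos/sin tables.
+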
l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = 
L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_norm_parameters_weight_ = L_self_modules_norm_parameters_weight_ + cache_position = torch.arange(0, 3, device=device(type="cuda", index=0)) + position_ids = cache_position.unsqueeze(0) + attention_mask = l_attention_mask_.to( + device=device(type="cuda", index=0), dtype=torch.bool + ) + l_attention_mask_ = None + mask_indices = torch.arange(3, device=device(type="cuda", index=0)) + mask_indices += 0 + mask_indices_1 = mask_indices + mask_indices = None + local_padding_mask = attention_mask[(slice(None, None, None), mask_indices_1)] + attention_mask = mask_indices_1 = None + kv_arange = torch.arange(3, device=device(type="cuda", index=0)) + kv_arange += 0 + kv_arange_1 = kv_arange + kv_arange = None + reshaped_cache_position = cache_position.view(-1, 1) + cache_position = None + causal_mask = kv_arange_1 <= reshaped_cache_position + kv_arange_1 = reshaped_cache_position = None + getitem_1 = causal_mask[ + (None, None, slice(None, None, None), slice(None, None, None)) + ] + causal_mask = None + causal_mask_1 = getitem_1.expand(1, -1, -1, -1) + getitem_1 = None + getitem_2 = local_padding_mask[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + local_padding_mask = None + causal_mask_2 = causal_mask_1 * getitem_2 + causal_mask_1 = getitem_2 = None + _set_grad_enabled = torch._C._set_grad_enabled(False) + _set_grad_enabled = None + getitem_3 = l_self_modules_rotary_emb_buffers_inv_freq_[ + (None, slice(None, None, None), None) + ] + l_self_modules_rotary_emb_buffers_inv_freq_ = None + float_1 = getitem_3.float() + getitem_3 = None + expand_1 = float_1.expand(1, -1, 1) + float_1 = None + inv_freq_expanded = expand_1.to(device(type="cuda", index=0)) + expand_1 = None + getitem_4 = position_ids[ + (slice(None, None, None), None, slice(None, None, None)) + ] + position_ids = None + position_ids_expanded = getitem_4.float() + getitem_4 = None + float_3 = inv_freq_expanded.float() + inv_freq_expanded = None + float_4 = position_ids_expanded.float() + position_ids_expanded = None + matmul = float_3 @ float_4 + float_3 = float_4 = None + freqs = matmul.transpose(1, 2) + matmul = None + emb = torch.cat((freqs, freqs), dim=-1) + freqs = None + cos = emb.cos() + cos_1 = cos * 1.0 + cos = None + sin = emb.sin() + emb = None + sin_1 = sin * 1.0 + sin = None + cos_2 = cos_1.to(dtype=torch.bfloat16) + cos_1 = None + sin_2 = sin_1.to(dtype=torch.bfloat16) + sin_1 = None + _set_grad_enabled_1 = torch._C._set_grad_enabled(True) + _set_grad_enabled_1 = None + _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module") + _log_api_usage_once = None + hidden_states = l_inputs_embeds_.to(torch.float32) + pow_1 = hidden_states.pow(2) + variance = pow_1.mean(-1, keepdim=True) + pow_1 = None + add = variance + 1e-06 + variance = None + rsqrt = torch.rsqrt(add) + add = None + hidden_states_1 = hidden_states * rsqrt + hidden_states = rsqrt = None + to_5 = hidden_states_1.to(torch.bfloat16) + 
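+        # NOTE: the aliasing block above appears to be Dynamo's flattening of
+        # module attribute paths into local names; e.g. the local
+        # l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_
+        # seems to correspond to self.layers[19].self_attn.q_proj.weight.
+        # The mask construction ANDs (boolean multiply) a broadcast padding mask
+        # with a lower-triangular causal mask, kv_arange_1 <= cache_position.view(-1, 1);
+        # sequence length 3 is baked in because the graph was traced with static shapes.
+        # The grad-disabled block precomputes rotary tables in float32 and casts them
+        # to bfloat16 (the * 1.0 factors suggest attention_scaling == 1.0). A compact
+        # sketch of the same math, with hypothetical names not defined in this file:
+        #     freqs = (inv_freq[None, :, None].float() @ pos[:, None, :].float()).transpose(1, 2)
+        #     emb = torch.cat((freqs, freqs), dim=-1)
+        #     cos_2, sin_2 = emb.cos().to(torch.bfloat16), emb.sin().to(torch.bfloat16)
+        # The pow(2).mean(-1) / rsqrt sequence entering layer 0 is RMSNorm with
+        # eps=1e-06, computed in float32 and cast back to bfloat16 before the
+        # elementwise weight multiply.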
hidden_states_1 = None + hidden_states_2 = ( + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + * to_5 + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + to_5 + ) = None + linear = torch._C._nn.linear( + hidden_states_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_1 = linear.view((1, 3, -1, 128)) + linear = None + query_states = view_1.transpose(1, 2) + view_1 = None + linear_1 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_2 = linear_1.view((1, 3, -1, 128)) + linear_1 = None + key_states = view_2.transpose(1, 2) + view_2 = None + linear_2 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_3 = linear_2.view((1, 3, -1, 128)) + linear_2 = None + value_states = view_3.transpose(1, 2) + view_3 = None + cos_3 = cos_2.unsqueeze(1) + sin_3 = sin_2.unsqueeze(1) + mul_5 = query_states * cos_3 + x1 = query_states[(Ellipsis, slice(None, 64, None))] + x2 = query_states[(Ellipsis, slice(64, None, None))] + query_states = None + neg = -x2 + x2 = None + cat_1 = torch.cat((neg, x1), dim=-1) + neg = x1 = None + mul_6 = cat_1 * sin_3 + cat_1 = None + q_embed = mul_5 + mul_6 + mul_5 = mul_6 = None + mul_7 = key_states * cos_3 + cos_3 = None + x1_1 = key_states[(Ellipsis, slice(None, 64, None))] + x2_1 = key_states[(Ellipsis, slice(64, None, None))] + key_states = None + neg_1 = -x2_1 + x2_1 = None + cat_2 = torch.cat((neg_1, x1_1), dim=-1) + neg_1 = x1_1 = None + mul_8 = cat_2 * sin_3 + cat_2 = sin_3 = None + k_embed = mul_7 + mul_8 + mul_7 = mul_8 = None + attention_mask_1 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query = q_embed.contiguous() + q_embed = None + key = k_embed.contiguous() + value = value_states.contiguous() + attn_output = torch._C._nn.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask_1, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query = key = value = attention_mask_1 = None + transpose_4 = attn_output.transpose(1, 2) + attn_output = None + attn_output_1 = transpose_4.contiguous() + transpose_4 = None + reshape = attn_output_1.reshape(1, 3, -1) + attn_output_1 = None + attn_output_2 = reshape.contiguous() + reshape = None + attn_output_3 = torch._C._nn.linear( + attn_output_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_3 = l_inputs_embeds_ + attn_output_3 + l_inputs_embeds_ = attn_output_3 = None + hidden_states_4 = hidden_states_3.to(torch.float32) + pow_2 = hidden_states_4.pow(2) + variance_1 = pow_2.mean(-1, keepdim=True) + pow_2 = None + add_4 = variance_1 + 1e-06 + variance_1 = None + rsqrt_1 = torch.rsqrt(add_4) + add_4 = None + hidden_states_5 = hidden_states_4 * rsqrt_1 + hidden_states_4 = rsqrt_1 = None + to_7 = 
hidden_states_5.to(torch.bfloat16) + hidden_states_5 = None + hidden_states_6 = ( + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + * to_7 + ) + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = ( + to_7 + ) = None + linear_4 = torch._C._nn.linear( + hidden_states_6, + l_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu = torch.nn.functional.silu(linear_4, inplace=False) + linear_4 = None + linear_5 = torch._C._nn.linear( + hidden_states_6, + l_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_6 = l_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_11 = silu * linear_5 + silu = linear_5 = None + down_proj = torch._C._nn.linear( + mul_11, + l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_11 = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_7 = hidden_states_3 + down_proj + hidden_states_3 = down_proj = None + hidden_states_8 = hidden_states_7.to(torch.float32) + pow_3 = hidden_states_8.pow(2) + variance_2 = pow_3.mean(-1, keepdim=True) + pow_3 = None + add_6 = variance_2 + 1e-06 + variance_2 = None + rsqrt_2 = torch.rsqrt(add_6) + add_6 = None + hidden_states_9 = hidden_states_8 * rsqrt_2 + hidden_states_8 = rsqrt_2 = None + to_9 = hidden_states_9.to(torch.bfloat16) + hidden_states_9 = None + hidden_states_10 = ( + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + * to_9 + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + to_9 + ) = None + linear_7 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_4 = linear_7.view((1, 3, -1, 128)) + linear_7 = None + query_states_1 = view_4.transpose(1, 2) + view_4 = None + linear_8 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_5 = linear_8.view((1, 3, -1, 128)) + linear_8 = None + key_states_1 = view_5.transpose(1, 2) + view_5 = None + linear_9 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_10 = l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_6 = linear_9.view((1, 3, -1, 128)) + linear_9 = None + value_states_1 = view_6.transpose(1, 2) + view_6 = None + cos_4 = cos_2.unsqueeze(1) + sin_4 = sin_2.unsqueeze(1) + mul_14 = query_states_1 * cos_4 + x1_2 = query_states_1[(Ellipsis, slice(None, 64, None))] + x2_2 = query_states_1[(Ellipsis, slice(64, None, None))] + query_states_1 = None + neg_2 = -x2_2 + x2_2 = None + cat_3 = torch.cat((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + mul_15 = cat_3 * sin_4 + cat_3 = None + q_embed_1 = mul_14 + mul_15 + mul_14 = mul_15 = None + mul_16 = key_states_1 * cos_4 + cos_4 = None + x1_3 = key_states_1[(Ellipsis, slice(None, 64, None))] + x2_3 = key_states_1[(Ellipsis, slice(64, None, 
None))] + key_states_1 = None + neg_3 = -x2_3 + x2_3 = None + cat_4 = torch.cat((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + mul_17 = cat_4 * sin_4 + cat_4 = sin_4 = None + k_embed_1 = mul_16 + mul_17 + mul_16 = mul_17 = None + attention_mask_2 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_1 = q_embed_1.contiguous() + q_embed_1 = None + key_1 = k_embed_1.contiguous() + value_1 = value_states_1.contiguous() + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_1, + value_1, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_1 = key_1 = value_1 = attention_mask_2 = None + transpose_8 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_8.contiguous() + transpose_8 = None + reshape_1 = attn_output_5.reshape(1, 3, -1) + attn_output_5 = None + attn_output_6 = reshape_1.contiguous() + reshape_1 = None + attn_output_7 = torch._C._nn.linear( + attn_output_6, + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_6 = l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_11 = hidden_states_7 + attn_output_7 + hidden_states_7 = attn_output_7 = None + hidden_states_12 = hidden_states_11.to(torch.float32) + pow_4 = hidden_states_12.pow(2) + variance_3 = pow_4.mean(-1, keepdim=True) + pow_4 = None + add_10 = variance_3 + 1e-06 + variance_3 = None + rsqrt_3 = torch.rsqrt(add_10) + add_10 = None + hidden_states_13 = hidden_states_12 * rsqrt_3 + hidden_states_12 = rsqrt_3 = None + to_11 = hidden_states_13.to(torch.bfloat16) + hidden_states_13 = None + hidden_states_14 = ( + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + * to_11 + ) + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = ( + to_11 + ) = None + linear_11 = torch._C._nn.linear( + hidden_states_14, + l_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_1 = torch.nn.functional.silu(linear_11, inplace=False) + linear_11 = None + linear_12 = torch._C._nn.linear( + hidden_states_14, + l_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_14 = l_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_20 = silu_1 * linear_12 + silu_1 = linear_12 = None + down_proj_1 = torch._C._nn.linear( + mul_20, + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_20 = l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_15 = hidden_states_11 + down_proj_1 + hidden_states_11 = down_proj_1 = None + hidden_states_16 = hidden_states_15.to(torch.float32) + pow_5 = hidden_states_16.pow(2) + variance_4 = pow_5.mean(-1, keepdim=True) + pow_5 = None + add_12 = variance_4 + 1e-06 + variance_4 = None + rsqrt_4 = torch.rsqrt(add_12) + add_12 = None + hidden_states_17 = hidden_states_16 * rsqrt_4 + hidden_states_16 = rsqrt_4 = None + to_13 = hidden_states_17.to(torch.bfloat16) + hidden_states_17 = None + hidden_states_18 = ( + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + * to_13 + ) + 
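+ # NOTE: every unrolled decoder layer repeats the attention pattern above:
+ # bias-free q/k/v projections viewed as (1, 3, -1, 128) and transposed to put
+ # heads ahead of the sequence axis, rotary embedding applied with the
+ # rotate-half identity, then scaled_dot_product_attention with an explicit
+ # boolean mask, is_causal=False (causality already lives in causal_mask_2),
+ # and scale=0.08838834764831845, i.e. head_dim ** -0.5 = 1 / sqrt(128).
+ # A minimal sketch of the RoPE step (rotate_half is a hypothetical helper,
+ # not defined in this file):
+ #     def rotate_half(x):
+ #         return torch.cat((-x[..., 64:], x[..., :64]), dim=-1)
+ #     q_embed = q * cos + rotate_half(q) * sin
+ #     k_embed = k * cos + rotate_half(k) * sin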
l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + to_13 + ) = None + linear_14 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_7 = linear_14.view((1, 3, -1, 128)) + linear_14 = None + query_states_2 = view_7.transpose(1, 2) + view_7 = None + linear_15 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_8 = linear_15.view((1, 3, -1, 128)) + linear_15 = None + key_states_2 = view_8.transpose(1, 2) + view_8 = None + linear_16 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_18 = l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_9 = linear_16.view((1, 3, -1, 128)) + linear_16 = None + value_states_2 = view_9.transpose(1, 2) + view_9 = None + cos_5 = cos_2.unsqueeze(1) + sin_5 = sin_2.unsqueeze(1) + mul_23 = query_states_2 * cos_5 + x1_4 = query_states_2[(Ellipsis, slice(None, 64, None))] + x2_4 = query_states_2[(Ellipsis, slice(64, None, None))] + query_states_2 = None + neg_4 = -x2_4 + x2_4 = None + cat_5 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_24 = cat_5 * sin_5 + cat_5 = None + q_embed_2 = mul_23 + mul_24 + mul_23 = mul_24 = None + mul_25 = key_states_2 * cos_5 + cos_5 = None + x1_5 = key_states_2[(Ellipsis, slice(None, 64, None))] + x2_5 = key_states_2[(Ellipsis, slice(64, None, None))] + key_states_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_6 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_26 = cat_6 * sin_5 + cat_6 = sin_5 = None + k_embed_2 = mul_25 + mul_26 + mul_25 = mul_26 = None + attention_mask_3 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_2 = q_embed_2.contiguous() + q_embed_2 = None + key_2 = k_embed_2.contiguous() + value_2 = value_states_2.contiguous() + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_2, + value_2, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_2 = key_2 = value_2 = attention_mask_3 = None + transpose_12 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_12.contiguous() + transpose_12 = None + reshape_2 = attn_output_9.reshape(1, 3, -1) + attn_output_9 = None + attn_output_10 = reshape_2.contiguous() + reshape_2 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_10 = l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_19 = hidden_states_15 + attn_output_11 + hidden_states_15 = attn_output_11 = None + hidden_states_20 = hidden_states_19.to(torch.float32) + pow_6 = hidden_states_20.pow(2) + variance_5 = pow_6.mean(-1, keepdim=True) + pow_6 = None + add_16 = variance_5 + 1e-06 + variance_5 = None + rsqrt_5 = torch.rsqrt(add_16) + add_16 = None + hidden_states_21 = hidden_states_20 * rsqrt_5 + hidden_states_20 = rsqrt_5 = None + to_15 = 
hidden_states_21.to(torch.bfloat16) + hidden_states_21 = None + hidden_states_22 = ( + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + * to_15 + ) + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = ( + to_15 + ) = None + linear_18 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_2 = torch.nn.functional.silu(linear_18, inplace=False) + linear_18 = None + linear_19 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_22 = l_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_29 = silu_2 * linear_19 + silu_2 = linear_19 = None + down_proj_2 = torch._C._nn.linear( + mul_29, + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_29 = l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_23 = hidden_states_19 + down_proj_2 + hidden_states_19 = down_proj_2 = None + hidden_states_24 = hidden_states_23.to(torch.float32) + pow_7 = hidden_states_24.pow(2) + variance_6 = pow_7.mean(-1, keepdim=True) + pow_7 = None + add_18 = variance_6 + 1e-06 + variance_6 = None + rsqrt_6 = torch.rsqrt(add_18) + add_18 = None + hidden_states_25 = hidden_states_24 * rsqrt_6 + hidden_states_24 = rsqrt_6 = None + to_17 = hidden_states_25.to(torch.bfloat16) + hidden_states_25 = None + hidden_states_26 = ( + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + * to_17 + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + to_17 + ) = None + linear_21 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_10 = linear_21.view((1, 3, -1, 128)) + linear_21 = None + query_states_3 = view_10.transpose(1, 2) + view_10 = None + linear_22 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_11 = linear_22.view((1, 3, -1, 128)) + linear_22 = None + key_states_3 = view_11.transpose(1, 2) + view_11 = None + linear_23 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_26 = l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_12 = linear_23.view((1, 3, -1, 128)) + linear_23 = None + value_states_3 = view_12.transpose(1, 2) + view_12 = None + cos_6 = cos_2.unsqueeze(1) + sin_6 = sin_2.unsqueeze(1) + mul_32 = query_states_3 * cos_6 + x1_6 = query_states_3[(Ellipsis, slice(None, 64, None))] + x2_6 = query_states_3[(Ellipsis, slice(64, None, None))] + query_states_3 = None + neg_6 = -x2_6 + x2_6 = None + cat_7 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_33 = cat_7 * sin_6 + cat_7 = None + q_embed_3 = mul_32 + mul_33 + mul_32 = mul_33 = None + mul_34 = key_states_3 * cos_6 + cos_6 = None + x1_7 = key_states_3[(Ellipsis, slice(None, 
64, None))] + x2_7 = key_states_3[(Ellipsis, slice(64, None, None))] + key_states_3 = None + neg_7 = -x2_7 + x2_7 = None + cat_8 = torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_35 = cat_8 * sin_6 + cat_8 = sin_6 = None + k_embed_3 = mul_34 + mul_35 + mul_34 = mul_35 = None + attention_mask_4 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_3 = q_embed_3.contiguous() + q_embed_3 = None + key_3 = k_embed_3.contiguous() + value_3 = value_states_3.contiguous() + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_3, + key_3, + value_3, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_3 = key_3 = value_3 = attention_mask_4 = None + transpose_16 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_16.contiguous() + transpose_16 = None + reshape_3 = attn_output_13.reshape(1, 3, -1) + attn_output_13 = None + attn_output_14 = reshape_3.contiguous() + reshape_3 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_14 = l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_27 = hidden_states_23 + attn_output_15 + hidden_states_23 = attn_output_15 = None + hidden_states_28 = hidden_states_27.to(torch.float32) + pow_8 = hidden_states_28.pow(2) + variance_7 = pow_8.mean(-1, keepdim=True) + pow_8 = None + add_22 = variance_7 + 1e-06 + variance_7 = None + rsqrt_7 = torch.rsqrt(add_22) + add_22 = None + hidden_states_29 = hidden_states_28 * rsqrt_7 + hidden_states_28 = rsqrt_7 = None + to_19 = hidden_states_29.to(torch.bfloat16) + hidden_states_29 = None + hidden_states_30 = ( + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + * to_19 + ) + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = ( + to_19 + ) = None + linear_25 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_3 = torch.nn.functional.silu(linear_25, inplace=False) + linear_25 = None + linear_26 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_30 = l_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_38 = silu_3 * linear_26 + silu_3 = linear_26 = None + down_proj_3 = torch._C._nn.linear( + mul_38, + l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_38 = l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_31 = hidden_states_27 + down_proj_3 + hidden_states_27 = down_proj_3 = None + hidden_states_32 = hidden_states_31.to(torch.float32) + pow_9 = hidden_states_32.pow(2) + variance_8 = pow_9.mean(-1, keepdim=True) + pow_9 = None + add_24 = variance_8 + 1e-06 + variance_8 = None + rsqrt_8 = torch.rsqrt(add_24) + add_24 = None + hidden_states_33 = hidden_states_32 * rsqrt_8 + hidden_states_32 = rsqrt_8 = None + to_21 = hidden_states_33.to(torch.bfloat16) + hidden_states_33 = None + hidden_states_34 = ( + 
l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + * to_21 + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + to_21 + ) = None + linear_28 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_13 = linear_28.view((1, 3, -1, 128)) + linear_28 = None + query_states_4 = view_13.transpose(1, 2) + view_13 = None + linear_29 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_14 = linear_29.view((1, 3, -1, 128)) + linear_29 = None + key_states_4 = view_14.transpose(1, 2) + view_14 = None + linear_30 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_34 = l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_15 = linear_30.view((1, 3, -1, 128)) + linear_30 = None + value_states_4 = view_15.transpose(1, 2) + view_15 = None + cos_7 = cos_2.unsqueeze(1) + sin_7 = sin_2.unsqueeze(1) + mul_41 = query_states_4 * cos_7 + x1_8 = query_states_4[(Ellipsis, slice(None, 64, None))] + x2_8 = query_states_4[(Ellipsis, slice(64, None, None))] + query_states_4 = None + neg_8 = -x2_8 + x2_8 = None + cat_9 = torch.cat((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + mul_42 = cat_9 * sin_7 + cat_9 = None + q_embed_4 = mul_41 + mul_42 + mul_41 = mul_42 = None + mul_43 = key_states_4 * cos_7 + cos_7 = None + x1_9 = key_states_4[(Ellipsis, slice(None, 64, None))] + x2_9 = key_states_4[(Ellipsis, slice(64, None, None))] + key_states_4 = None + neg_9 = -x2_9 + x2_9 = None + cat_10 = torch.cat((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + mul_44 = cat_10 * sin_7 + cat_10 = sin_7 = None + k_embed_4 = mul_43 + mul_44 + mul_43 = mul_44 = None + attention_mask_5 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_4 = q_embed_4.contiguous() + q_embed_4 = None + key_4 = k_embed_4.contiguous() + value_4 = value_states_4.contiguous() + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_4, + value_4, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_4 = key_4 = value_4 = attention_mask_5 = None + transpose_20 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_20.contiguous() + transpose_20 = None + reshape_4 = attn_output_17.reshape(1, 3, -1) + attn_output_17 = None + attn_output_18 = reshape_4.contiguous() + reshape_4 = None + attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_18 = l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_35 = hidden_states_31 + attn_output_19 + hidden_states_31 = attn_output_19 = None + hidden_states_36 = hidden_states_35.to(torch.float32) + pow_10 = hidden_states_36.pow(2) + variance_9 = pow_10.mean(-1, keepdim=True) + pow_10 = None + add_28 = variance_9 + 1e-06 + variance_9 = None + rsqrt_9 = torch.rsqrt(add_28) + add_28 = 
None + hidden_states_37 = hidden_states_36 * rsqrt_9 + hidden_states_36 = rsqrt_9 = None + to_23 = hidden_states_37.to(torch.bfloat16) + hidden_states_37 = None + hidden_states_38 = ( + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + * to_23 + ) + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = ( + to_23 + ) = None + linear_32 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_4 = torch.nn.functional.silu(linear_32, inplace=False) + linear_32 = None + linear_33 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_38 = l_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_47 = silu_4 * linear_33 + silu_4 = linear_33 = None + down_proj_4 = torch._C._nn.linear( + mul_47, + l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_47 = l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_39 = hidden_states_35 + down_proj_4 + hidden_states_35 = down_proj_4 = None + hidden_states_40 = hidden_states_39.to(torch.float32) + pow_11 = hidden_states_40.pow(2) + variance_10 = pow_11.mean(-1, keepdim=True) + pow_11 = None + add_30 = variance_10 + 1e-06 + variance_10 = None + rsqrt_10 = torch.rsqrt(add_30) + add_30 = None + hidden_states_41 = hidden_states_40 * rsqrt_10 + hidden_states_40 = rsqrt_10 = None + to_25 = hidden_states_41.to(torch.bfloat16) + hidden_states_41 = None + hidden_states_42 = ( + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + * to_25 + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + to_25 + ) = None + linear_35 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_16 = linear_35.view((1, 3, -1, 128)) + linear_35 = None + query_states_5 = view_16.transpose(1, 2) + view_16 = None + linear_36 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_17 = linear_36.view((1, 3, -1, 128)) + linear_36 = None + key_states_5 = view_17.transpose(1, 2) + view_17 = None + linear_37 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_42 = l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_18 = linear_37.view((1, 3, -1, 128)) + linear_37 = None + value_states_5 = view_18.transpose(1, 2) + view_18 = None + cos_8 = cos_2.unsqueeze(1) + sin_8 = sin_2.unsqueeze(1) + mul_50 = query_states_5 * cos_8 + x1_10 = query_states_5[(Ellipsis, slice(None, 64, None))] + x2_10 = query_states_5[(Ellipsis, slice(64, None, None))] + query_states_5 = None + neg_10 = -x2_10 + x2_10 = None + cat_11 = torch.cat((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + mul_51 = cat_11 * sin_8 + cat_11 = None + q_embed_5 = mul_50 + 
mul_51 + mul_50 = mul_51 = None + mul_52 = key_states_5 * cos_8 + cos_8 = None + x1_11 = key_states_5[(Ellipsis, slice(None, 64, None))] + x2_11 = key_states_5[(Ellipsis, slice(64, None, None))] + key_states_5 = None + neg_11 = -x2_11 + x2_11 = None + cat_12 = torch.cat((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + mul_53 = cat_12 * sin_8 + cat_12 = sin_8 = None + k_embed_5 = mul_52 + mul_53 + mul_52 = mul_53 = None + attention_mask_6 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_5 = q_embed_5.contiguous() + q_embed_5 = None + key_5 = k_embed_5.contiguous() + value_5 = value_states_5.contiguous() + attn_output_20 = torch._C._nn.scaled_dot_product_attention( + query_5, + key_5, + value_5, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_5 = key_5 = value_5 = attention_mask_6 = None + transpose_24 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_24.contiguous() + transpose_24 = None + reshape_5 = attn_output_21.reshape(1, 3, -1) + attn_output_21 = None + attn_output_22 = reshape_5.contiguous() + reshape_5 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_22 = l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_43 = hidden_states_39 + attn_output_23 + hidden_states_39 = attn_output_23 = None + hidden_states_44 = hidden_states_43.to(torch.float32) + pow_12 = hidden_states_44.pow(2) + variance_11 = pow_12.mean(-1, keepdim=True) + pow_12 = None + add_34 = variance_11 + 1e-06 + variance_11 = None + rsqrt_11 = torch.rsqrt(add_34) + add_34 = None + hidden_states_45 = hidden_states_44 * rsqrt_11 + hidden_states_44 = rsqrt_11 = None + to_27 = hidden_states_45.to(torch.bfloat16) + hidden_states_45 = None + hidden_states_46 = ( + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + * to_27 + ) + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = ( + to_27 + ) = None + linear_39 = torch._C._nn.linear( + hidden_states_46, + l_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_5 = torch.nn.functional.silu(linear_39, inplace=False) + linear_39 = None + linear_40 = torch._C._nn.linear( + hidden_states_46, + l_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_46 = l_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_56 = silu_5 * linear_40 + silu_5 = linear_40 = None + down_proj_5 = torch._C._nn.linear( + mul_56, + l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_56 = l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_47 = hidden_states_43 + down_proj_5 + hidden_states_43 = down_proj_5 = None + hidden_states_48 = hidden_states_47.to(torch.float32) + pow_13 = hidden_states_48.pow(2) + variance_12 = pow_13.mean(-1, keepdim=True) + pow_13 = None + add_36 = variance_12 + 1e-06 + variance_12 = None + rsqrt_12 = torch.rsqrt(add_36) + add_36 = None + hidden_states_49 = hidden_states_48 * rsqrt_12 + hidden_states_48 = rsqrt_12 = None 
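+ # NOTE: the MLP repeated in each layer above is the gated (SwiGLU-style)
+ # feed-forward used by this model family, with all three projections
+ # bias-free; schematically:
+ #     down_proj(F.silu(gate_proj(x)) * up_proj(x))
+ # and its output is added back to the residual stream before the next
+ # float32 RMSNorm.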
+ to_29 = hidden_states_49.to(torch.bfloat16) + hidden_states_49 = None + hidden_states_50 = ( + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + * to_29 + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + to_29 + ) = None + linear_42 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_19 = linear_42.view((1, 3, -1, 128)) + linear_42 = None + query_states_6 = view_19.transpose(1, 2) + view_19 = None + linear_43 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_20 = linear_43.view((1, 3, -1, 128)) + linear_43 = None + key_states_6 = view_20.transpose(1, 2) + view_20 = None + linear_44 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_50 = l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_21 = linear_44.view((1, 3, -1, 128)) + linear_44 = None + value_states_6 = view_21.transpose(1, 2) + view_21 = None + cos_9 = cos_2.unsqueeze(1) + sin_9 = sin_2.unsqueeze(1) + mul_59 = query_states_6 * cos_9 + x1_12 = query_states_6[(Ellipsis, slice(None, 64, None))] + x2_12 = query_states_6[(Ellipsis, slice(64, None, None))] + query_states_6 = None + neg_12 = -x2_12 + x2_12 = None + cat_13 = torch.cat((neg_12, x1_12), dim=-1) + neg_12 = x1_12 = None + mul_60 = cat_13 * sin_9 + cat_13 = None + q_embed_6 = mul_59 + mul_60 + mul_59 = mul_60 = None + mul_61 = key_states_6 * cos_9 + cos_9 = None + x1_13 = key_states_6[(Ellipsis, slice(None, 64, None))] + x2_13 = key_states_6[(Ellipsis, slice(64, None, None))] + key_states_6 = None + neg_13 = -x2_13 + x2_13 = None + cat_14 = torch.cat((neg_13, x1_13), dim=-1) + neg_13 = x1_13 = None + mul_62 = cat_14 * sin_9 + cat_14 = sin_9 = None + k_embed_6 = mul_61 + mul_62 + mul_61 = mul_62 = None + attention_mask_7 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_6 = q_embed_6.contiguous() + q_embed_6 = None + key_6 = k_embed_6.contiguous() + value_6 = value_states_6.contiguous() + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_6, + value_6, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_6 = key_6 = value_6 = attention_mask_7 = None + transpose_28 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_28.contiguous() + transpose_28 = None + reshape_6 = attn_output_25.reshape(1, 3, -1) + attn_output_25 = None + attn_output_26 = reshape_6.contiguous() + reshape_6 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_26 = l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_51 = hidden_states_47 + attn_output_27 + hidden_states_47 = attn_output_27 = None + hidden_states_52 = hidden_states_51.to(torch.float32) + pow_14 = hidden_states_52.pow(2) + variance_13 = pow_14.mean(-1, 
keepdim=True) + pow_14 = None + add_40 = variance_13 + 1e-06 + variance_13 = None + rsqrt_13 = torch.rsqrt(add_40) + add_40 = None + hidden_states_53 = hidden_states_52 * rsqrt_13 + hidden_states_52 = rsqrt_13 = None + to_31 = hidden_states_53.to(torch.bfloat16) + hidden_states_53 = None + hidden_states_54 = ( + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + * to_31 + ) + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = ( + to_31 + ) = None + linear_46 = torch._C._nn.linear( + hidden_states_54, + l_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_6 = torch.nn.functional.silu(linear_46, inplace=False) + linear_46 = None + linear_47 = torch._C._nn.linear( + hidden_states_54, + l_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_54 = l_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_65 = silu_6 * linear_47 + silu_6 = linear_47 = None + down_proj_6 = torch._C._nn.linear( + mul_65, + l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_65 = l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_55 = hidden_states_51 + down_proj_6 + hidden_states_51 = down_proj_6 = None + hidden_states_56 = hidden_states_55.to(torch.float32) + pow_15 = hidden_states_56.pow(2) + variance_14 = pow_15.mean(-1, keepdim=True) + pow_15 = None + add_42 = variance_14 + 1e-06 + variance_14 = None + rsqrt_14 = torch.rsqrt(add_42) + add_42 = None + hidden_states_57 = hidden_states_56 * rsqrt_14 + hidden_states_56 = rsqrt_14 = None + to_33 = hidden_states_57.to(torch.bfloat16) + hidden_states_57 = None + hidden_states_58 = ( + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + * to_33 + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + to_33 + ) = None + linear_49 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_22 = linear_49.view((1, 3, -1, 128)) + linear_49 = None + query_states_7 = view_22.transpose(1, 2) + view_22 = None + linear_50 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_23 = linear_50.view((1, 3, -1, 128)) + linear_50 = None + key_states_7 = view_23.transpose(1, 2) + view_23 = None + linear_51 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_58 = l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_24 = linear_51.view((1, 3, -1, 128)) + linear_51 = None + value_states_7 = view_24.transpose(1, 2) + view_24 = None + cos_10 = cos_2.unsqueeze(1) + sin_10 = sin_2.unsqueeze(1) + mul_68 = query_states_7 * cos_10 + x1_14 = query_states_7[(Ellipsis, slice(None, 64, None))] + x2_14 = query_states_7[(Ellipsis, slice(64, None, None))] + query_states_7 = None + neg_14 = -x2_14 + x2_14 = None + cat_15 
= torch.cat((neg_14, x1_14), dim=-1) + neg_14 = x1_14 = None + mul_69 = cat_15 * sin_10 + cat_15 = None + q_embed_7 = mul_68 + mul_69 + mul_68 = mul_69 = None + mul_70 = key_states_7 * cos_10 + cos_10 = None + x1_15 = key_states_7[(Ellipsis, slice(None, 64, None))] + x2_15 = key_states_7[(Ellipsis, slice(64, None, None))] + key_states_7 = None + neg_15 = -x2_15 + x2_15 = None + cat_16 = torch.cat((neg_15, x1_15), dim=-1) + neg_15 = x1_15 = None + mul_71 = cat_16 * sin_10 + cat_16 = sin_10 = None + k_embed_7 = mul_70 + mul_71 + mul_70 = mul_71 = None + attention_mask_8 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_7 = q_embed_7.contiguous() + q_embed_7 = None + key_7 = k_embed_7.contiguous() + value_7 = value_states_7.contiguous() + attn_output_28 = torch._C._nn.scaled_dot_product_attention( + query_7, + key_7, + value_7, + attn_mask=attention_mask_8, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_7 = key_7 = value_7 = attention_mask_8 = None + transpose_32 = attn_output_28.transpose(1, 2) + attn_output_28 = None + attn_output_29 = transpose_32.contiguous() + transpose_32 = None + reshape_7 = attn_output_29.reshape(1, 3, -1) + attn_output_29 = None + attn_output_30 = reshape_7.contiguous() + reshape_7 = None + attn_output_31 = torch._C._nn.linear( + attn_output_30, + l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_30 = l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_59 = hidden_states_55 + attn_output_31 + hidden_states_55 = attn_output_31 = None + hidden_states_60 = hidden_states_59.to(torch.float32) + pow_16 = hidden_states_60.pow(2) + variance_15 = pow_16.mean(-1, keepdim=True) + pow_16 = None + add_46 = variance_15 + 1e-06 + variance_15 = None + rsqrt_15 = torch.rsqrt(add_46) + add_46 = None + hidden_states_61 = hidden_states_60 * rsqrt_15 + hidden_states_60 = rsqrt_15 = None + to_35 = hidden_states_61.to(torch.bfloat16) + hidden_states_61 = None + hidden_states_62 = ( + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + * to_35 + ) + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = ( + to_35 + ) = None + linear_53 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_7 = torch.nn.functional.silu(linear_53, inplace=False) + linear_53 = None + linear_54 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_62 = l_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_74 = silu_7 * linear_54 + silu_7 = linear_54 = None + down_proj_7 = torch._C._nn.linear( + mul_74, + l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_74 = l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_63 = hidden_states_59 + down_proj_7 + hidden_states_59 = down_proj_7 = None + hidden_states_64 = hidden_states_63.to(torch.float32) + pow_17 = hidden_states_64.pow(2) + variance_16 = pow_17.mean(-1, keepdim=True) + pow_17 = None + add_48 = variance_16 + 1e-06 + variance_16 = None + 
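+ # NOTE: the pervasive `name = None` assignments appear to be generated
+ # reference drops: FX-style codegen clears each intermediate as soon as it is
+ # dead so its tensor storage can be freed eagerly, which matters in a fully
+ # unrolled graph that would otherwise keep every layer's activations alive.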
rsqrt_16 = torch.rsqrt(add_48) + add_48 = None + hidden_states_65 = hidden_states_64 * rsqrt_16 + hidden_states_64 = rsqrt_16 = None + to_37 = hidden_states_65.to(torch.bfloat16) + hidden_states_65 = None + hidden_states_66 = ( + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + * to_37 + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + to_37 + ) = None + linear_56 = torch._C._nn.linear( + hidden_states_66, + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_25 = linear_56.view((1, 3, -1, 128)) + linear_56 = None + query_states_8 = view_25.transpose(1, 2) + view_25 = None + linear_57 = torch._C._nn.linear( + hidden_states_66, + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_26 = linear_57.view((1, 3, -1, 128)) + linear_57 = None + key_states_8 = view_26.transpose(1, 2) + view_26 = None + linear_58 = torch._C._nn.linear( + hidden_states_66, + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_66 = l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_27 = linear_58.view((1, 3, -1, 128)) + linear_58 = None + value_states_8 = view_27.transpose(1, 2) + view_27 = None + cos_11 = cos_2.unsqueeze(1) + sin_11 = sin_2.unsqueeze(1) + mul_77 = query_states_8 * cos_11 + x1_16 = query_states_8[(Ellipsis, slice(None, 64, None))] + x2_16 = query_states_8[(Ellipsis, slice(64, None, None))] + query_states_8 = None + neg_16 = -x2_16 + x2_16 = None + cat_17 = torch.cat((neg_16, x1_16), dim=-1) + neg_16 = x1_16 = None + mul_78 = cat_17 * sin_11 + cat_17 = None + q_embed_8 = mul_77 + mul_78 + mul_77 = mul_78 = None + mul_79 = key_states_8 * cos_11 + cos_11 = None + x1_17 = key_states_8[(Ellipsis, slice(None, 64, None))] + x2_17 = key_states_8[(Ellipsis, slice(64, None, None))] + key_states_8 = None + neg_17 = -x2_17 + x2_17 = None + cat_18 = torch.cat((neg_17, x1_17), dim=-1) + neg_17 = x1_17 = None + mul_80 = cat_18 * sin_11 + cat_18 = sin_11 = None + k_embed_8 = mul_79 + mul_80 + mul_79 = mul_80 = None + attention_mask_9 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_8 = q_embed_8.contiguous() + q_embed_8 = None + key_8 = k_embed_8.contiguous() + value_8 = value_states_8.contiguous() + attn_output_32 = torch._C._nn.scaled_dot_product_attention( + query_8, + key_8, + value_8, + attn_mask=attention_mask_9, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_8 = key_8 = value_8 = attention_mask_9 = None + transpose_36 = attn_output_32.transpose(1, 2) + attn_output_32 = None + attn_output_33 = transpose_36.contiguous() + transpose_36 = None + reshape_8 = attn_output_33.reshape(1, 3, -1) + attn_output_33 = None + attn_output_34 = reshape_8.contiguous() + reshape_8 = None + attn_output_35 = torch._C._nn.linear( + attn_output_34, + l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_34 = l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_67 = hidden_states_63 + attn_output_35 + hidden_states_63 = 
attn_output_35 = None + hidden_states_68 = hidden_states_67.to(torch.float32) + pow_18 = hidden_states_68.pow(2) + variance_17 = pow_18.mean(-1, keepdim=True) + pow_18 = None + add_52 = variance_17 + 1e-06 + variance_17 = None + rsqrt_17 = torch.rsqrt(add_52) + add_52 = None + hidden_states_69 = hidden_states_68 * rsqrt_17 + hidden_states_68 = rsqrt_17 = None + to_39 = hidden_states_69.to(torch.bfloat16) + hidden_states_69 = None + hidden_states_70 = ( + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + * to_39 + ) + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = ( + to_39 + ) = None + linear_60 = torch._C._nn.linear( + hidden_states_70, + l_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_8 = torch.nn.functional.silu(linear_60, inplace=False) + linear_60 = None + linear_61 = torch._C._nn.linear( + hidden_states_70, + l_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_70 = l_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_83 = silu_8 * linear_61 + silu_8 = linear_61 = None + down_proj_8 = torch._C._nn.linear( + mul_83, + l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_83 = l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_71 = hidden_states_67 + down_proj_8 + hidden_states_67 = down_proj_8 = None + hidden_states_72 = hidden_states_71.to(torch.float32) + pow_19 = hidden_states_72.pow(2) + variance_18 = pow_19.mean(-1, keepdim=True) + pow_19 = None + add_54 = variance_18 + 1e-06 + variance_18 = None + rsqrt_18 = torch.rsqrt(add_54) + add_54 = None + hidden_states_73 = hidden_states_72 * rsqrt_18 + hidden_states_72 = rsqrt_18 = None + to_41 = hidden_states_73.to(torch.bfloat16) + hidden_states_73 = None + hidden_states_74 = ( + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + * to_41 + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + to_41 + ) = None + linear_63 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_28 = linear_63.view((1, 3, -1, 128)) + linear_63 = None + query_states_9 = view_28.transpose(1, 2) + view_28 = None + linear_64 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_29 = linear_64.view((1, 3, -1, 128)) + linear_64 = None + key_states_9 = view_29.transpose(1, 2) + view_29 = None + linear_65 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_74 = l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_30 = linear_65.view((1, 3, -1, 128)) + linear_65 = None + value_states_9 = view_30.transpose(1, 2) + view_30 = None + cos_12 = cos_2.unsqueeze(1) + sin_12 = sin_2.unsqueeze(1) + mul_86 = query_states_9 * cos_12 + x1_18 = query_states_9[(Ellipsis, 
slice(None, 64, None))] + x2_18 = query_states_9[(Ellipsis, slice(64, None, None))] + query_states_9 = None + neg_18 = -x2_18 + x2_18 = None + cat_19 = torch.cat((neg_18, x1_18), dim=-1) + neg_18 = x1_18 = None + mul_87 = cat_19 * sin_12 + cat_19 = None + q_embed_9 = mul_86 + mul_87 + mul_86 = mul_87 = None + mul_88 = key_states_9 * cos_12 + cos_12 = None + x1_19 = key_states_9[(Ellipsis, slice(None, 64, None))] + x2_19 = key_states_9[(Ellipsis, slice(64, None, None))] + key_states_9 = None + neg_19 = -x2_19 + x2_19 = None + cat_20 = torch.cat((neg_19, x1_19), dim=-1) + neg_19 = x1_19 = None + mul_89 = cat_20 * sin_12 + cat_20 = sin_12 = None + k_embed_9 = mul_88 + mul_89 + mul_88 = mul_89 = None + attention_mask_10 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_9 = q_embed_9.contiguous() + q_embed_9 = None + key_9 = k_embed_9.contiguous() + value_9 = value_states_9.contiguous() + attn_output_36 = torch._C._nn.scaled_dot_product_attention( + query_9, + key_9, + value_9, + attn_mask=attention_mask_10, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_9 = key_9 = value_9 = attention_mask_10 = None + transpose_40 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_37 = transpose_40.contiguous() + transpose_40 = None + reshape_9 = attn_output_37.reshape(1, 3, -1) + attn_output_37 = None + attn_output_38 = reshape_9.contiguous() + reshape_9 = None + attn_output_39 = torch._C._nn.linear( + attn_output_38, + l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_38 = l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_75 = hidden_states_71 + attn_output_39 + hidden_states_71 = attn_output_39 = None + hidden_states_76 = hidden_states_75.to(torch.float32) + pow_20 = hidden_states_76.pow(2) + variance_19 = pow_20.mean(-1, keepdim=True) + pow_20 = None + add_58 = variance_19 + 1e-06 + variance_19 = None + rsqrt_19 = torch.rsqrt(add_58) + add_58 = None + hidden_states_77 = hidden_states_76 * rsqrt_19 + hidden_states_76 = rsqrt_19 = None + to_43 = hidden_states_77.to(torch.bfloat16) + hidden_states_77 = None + hidden_states_78 = ( + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + * to_43 + ) + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = ( + to_43 + ) = None + linear_67 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_9 = torch.nn.functional.silu(linear_67, inplace=False) + linear_67 = None + linear_68 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_78 = l_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_92 = silu_9 * linear_68 + silu_9 = linear_68 = None + down_proj_9 = torch._C._nn.linear( + mul_92, + l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_92 = l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_79 = hidden_states_75 + down_proj_9 + hidden_states_75 = down_proj_9 = None + hidden_states_80 = hidden_states_79.to(torch.float32) 
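+ # NOTE: the causal_mask_2[:, :, :, :3] slice taken before every attention
+ # call looks like a no-op here, since the mask's key dimension is already 3;
+ # it is presumably residue of generic code that trims the mask to the current
+ # key length, specialized by tracing at a fixed sequence length of 3.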
+ pow_21 = hidden_states_80.pow(2) + variance_20 = pow_21.mean(-1, keepdim=True) + pow_21 = None + add_60 = variance_20 + 1e-06 + variance_20 = None + rsqrt_20 = torch.rsqrt(add_60) + add_60 = None + hidden_states_81 = hidden_states_80 * rsqrt_20 + hidden_states_80 = rsqrt_20 = None + to_45 = hidden_states_81.to(torch.bfloat16) + hidden_states_81 = None + hidden_states_82 = ( + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + * to_45 + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + to_45 + ) = None + linear_70 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_31 = linear_70.view((1, 3, -1, 128)) + linear_70 = None + query_states_10 = view_31.transpose(1, 2) + view_31 = None + linear_71 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_32 = linear_71.view((1, 3, -1, 128)) + linear_71 = None + key_states_10 = view_32.transpose(1, 2) + view_32 = None + linear_72 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_82 = l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_33 = linear_72.view((1, 3, -1, 128)) + linear_72 = None + value_states_10 = view_33.transpose(1, 2) + view_33 = None + cos_13 = cos_2.unsqueeze(1) + sin_13 = sin_2.unsqueeze(1) + mul_95 = query_states_10 * cos_13 + x1_20 = query_states_10[(Ellipsis, slice(None, 64, None))] + x2_20 = query_states_10[(Ellipsis, slice(64, None, None))] + query_states_10 = None + neg_20 = -x2_20 + x2_20 = None + cat_21 = torch.cat((neg_20, x1_20), dim=-1) + neg_20 = x1_20 = None + mul_96 = cat_21 * sin_13 + cat_21 = None + q_embed_10 = mul_95 + mul_96 + mul_95 = mul_96 = None + mul_97 = key_states_10 * cos_13 + cos_13 = None + x1_21 = key_states_10[(Ellipsis, slice(None, 64, None))] + x2_21 = key_states_10[(Ellipsis, slice(64, None, None))] + key_states_10 = None + neg_21 = -x2_21 + x2_21 = None + cat_22 = torch.cat((neg_21, x1_21), dim=-1) + neg_21 = x1_21 = None + mul_98 = cat_22 * sin_13 + cat_22 = sin_13 = None + k_embed_10 = mul_97 + mul_98 + mul_97 = mul_98 = None + attention_mask_11 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_10 = q_embed_10.contiguous() + q_embed_10 = None + key_10 = k_embed_10.contiguous() + value_10 = value_states_10.contiguous() + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + query_10, + key_10, + value_10, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_10 = key_10 = value_10 = attention_mask_11 = None + transpose_44 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_44.contiguous() + transpose_44 = None + reshape_10 = attn_output_41.reshape(1, 3, -1) + attn_output_41 = None + attn_output_42 = reshape_10.contiguous() + reshape_10 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + 
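+ # NOTE: the layer blocks above repeat the same computation verbatim; only the
+ # layer index embedded in the weight names and the SSA-style numeric suffixes
+ # (linear_*, attn_output_*, hidden_states_*) advance, which is characteristic
+ # of a fully unrolled, shape-specialized trace rather than a Python loop.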
+    attn_output_42 = l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_83 = hidden_states_79 + attn_output_43
+    hidden_states_79 = attn_output_43 = None
+    hidden_states_84 = hidden_states_83.to(torch.float32)
+    pow_22 = hidden_states_84.pow(2)
+    variance_21 = pow_22.mean(-1, keepdim=True)
+    pow_22 = None
+    add_64 = variance_21 + 1e-06
+    variance_21 = None
+    rsqrt_21 = torch.rsqrt(add_64)
+    add_64 = None
+    hidden_states_85 = hidden_states_84 * rsqrt_21
+    hidden_states_84 = rsqrt_21 = None
+    to_47 = hidden_states_85.to(torch.bfloat16)
+    hidden_states_85 = None
+    hidden_states_86 = (
+        l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_
+        * to_47
+    )
+    l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = (
+        to_47
+    ) = None
+    linear_74 = torch._C._nn.linear(
+        hidden_states_86,
+        l_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_10 = torch.nn.functional.silu(linear_74, inplace=False)
+    linear_74 = None
+    linear_75 = torch._C._nn.linear(
+        hidden_states_86,
+        l_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_86 = l_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_101 = silu_10 * linear_75
+    silu_10 = linear_75 = None
+    down_proj_10 = torch._C._nn.linear(
+        mul_101,
+        l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_101 = l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_87 = hidden_states_83 + down_proj_10
+    hidden_states_83 = down_proj_10 = None
+    hidden_states_88 = hidden_states_87.to(torch.float32)
+    pow_23 = hidden_states_88.pow(2)
+    variance_22 = pow_23.mean(-1, keepdim=True)
+    pow_23 = None
+    add_66 = variance_22 + 1e-06
+    variance_22 = None
+    rsqrt_22 = torch.rsqrt(add_66)
+    add_66 = None
+    hidden_states_89 = hidden_states_88 * rsqrt_22
+    hidden_states_88 = rsqrt_22 = None
+    to_49 = hidden_states_89.to(torch.bfloat16)
+    hidden_states_89 = None
+    hidden_states_90 = (
+        l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_
+        * to_49
+    )
+    l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = (
+        to_49
+    ) = None
+    linear_77 = torch._C._nn.linear(
+        hidden_states_90,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_34 = linear_77.view((1, 3, -1, 128))
+    linear_77 = None
+    query_states_11 = view_34.transpose(1, 2)
+    view_34 = None
+    linear_78 = torch._C._nn.linear(
+        hidden_states_90,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_35 = linear_78.view((1, 3, -1, 128))
+    linear_78 = None
+    key_states_11 = view_35.transpose(1, 2)
+    view_35 = None
+    linear_79 = torch._C._nn.linear(
+        hidden_states_90,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_90 = l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_36 = linear_79.view((1, 3, -1, 128))
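+    # --- Editor's sketch (illustrative, not part of the captured graph): the
+    # view/transpose pairs like the one above reshape each projection from
+    # (batch=1, seq=3, hidden) to (batch, heads, seq, head_dim=128) so that
+    # attention runs per head:
+    def _split_heads_sketch(x, head_dim=128):
+        b, s, _ = x.shape
+        return x.view(b, s, -1, head_dim).transpose(1, 2)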
+    linear_79 = None
+    value_states_11 = view_36.transpose(1, 2)
+    view_36 = None
+    cos_14 = cos_2.unsqueeze(1)
+    sin_14 = sin_2.unsqueeze(1)
+    mul_104 = query_states_11 * cos_14
+    x1_22 = query_states_11[(Ellipsis, slice(None, 64, None))]
+    x2_22 = query_states_11[(Ellipsis, slice(64, None, None))]
+    query_states_11 = None
+    neg_22 = -x2_22
+    x2_22 = None
+    cat_23 = torch.cat((neg_22, x1_22), dim=-1)
+    neg_22 = x1_22 = None
+    mul_105 = cat_23 * sin_14
+    cat_23 = None
+    q_embed_11 = mul_104 + mul_105
+    mul_104 = mul_105 = None
+    mul_106 = key_states_11 * cos_14
+    cos_14 = None
+    x1_23 = key_states_11[(Ellipsis, slice(None, 64, None))]
+    x2_23 = key_states_11[(Ellipsis, slice(64, None, None))]
+    key_states_11 = None
+    neg_23 = -x2_23
+    x2_23 = None
+    cat_24 = torch.cat((neg_23, x1_23), dim=-1)
+    neg_23 = x1_23 = None
+    mul_107 = cat_24 * sin_14
+    cat_24 = sin_14 = None
+    k_embed_11 = mul_106 + mul_107
+    mul_106 = mul_107 = None
+    attention_mask_12 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_11 = q_embed_11.contiguous()
+    q_embed_11 = None
+    key_11 = k_embed_11.contiguous()
+    value_11 = value_states_11.contiguous()
+    attn_output_44 = torch._C._nn.scaled_dot_product_attention(
+        query_11,
+        key_11,
+        value_11,
+        attn_mask=attention_mask_12,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_11 = key_11 = value_11 = attention_mask_12 = None
+    transpose_48 = attn_output_44.transpose(1, 2)
+    attn_output_44 = None
+    attn_output_45 = transpose_48.contiguous()
+    transpose_48 = None
+    reshape_11 = attn_output_45.reshape(1, 3, -1)
+    attn_output_45 = None
+    attn_output_46 = reshape_11.contiguous()
+    reshape_11 = None
+    attn_output_47 = torch._C._nn.linear(
+        attn_output_46,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_46 = l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_91 = hidden_states_87 + attn_output_47
+    hidden_states_87 = attn_output_47 = None
+    hidden_states_92 = hidden_states_91.to(torch.float32)
+    pow_24 = hidden_states_92.pow(2)
+    variance_23 = pow_24.mean(-1, keepdim=True)
+    pow_24 = None
+    add_70 = variance_23 + 1e-06
+    variance_23 = None
+    rsqrt_23 = torch.rsqrt(add_70)
+    add_70 = None
+    hidden_states_93 = hidden_states_92 * rsqrt_23
+    hidden_states_92 = rsqrt_23 = None
+    to_51 = hidden_states_93.to(torch.bfloat16)
+    hidden_states_93 = None
+    hidden_states_94 = (
+        l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_
+        * to_51
+    )
+    l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = (
+        to_51
+    ) = None
+    linear_81 = torch._C._nn.linear(
+        hidden_states_94,
+        l_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_11 = torch.nn.functional.silu(linear_81, inplace=False)
+    linear_81 = None
+    linear_82 = torch._C._nn.linear(
+        hidden_states_94,
+        l_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_94 = l_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_110 = silu_11 * linear_82
+    silu_11 = linear_82 = None
+    down_proj_11 = torch._C._nn.linear(
+        mul_110,
+        l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_110 = l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_95 = hidden_states_91 + down_proj_11
+    hidden_states_91 = down_proj_11 = None
+    hidden_states_96 = hidden_states_95.to(torch.float32)
+    pow_25 = hidden_states_96.pow(2)
+    variance_24 = pow_25.mean(-1, keepdim=True)
+    pow_25 = None
+    add_72 = variance_24 + 1e-06
+    variance_24 = None
+    rsqrt_24 = torch.rsqrt(add_72)
+    add_72 = None
+    hidden_states_97 = hidden_states_96 * rsqrt_24
+    hidden_states_96 = rsqrt_24 = None
+    to_53 = hidden_states_97.to(torch.bfloat16)
+    hidden_states_97 = None
+    hidden_states_98 = (
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_
+        * to_53
+    )
+    l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = (
+        to_53
+    ) = None
+    linear_84 = torch._C._nn.linear(
+        hidden_states_98,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_37 = linear_84.view((1, 3, -1, 128))
+    linear_84 = None
+    query_states_12 = view_37.transpose(1, 2)
+    view_37 = None
+    linear_85 = torch._C._nn.linear(
+        hidden_states_98,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_38 = linear_85.view((1, 3, -1, 128))
+    linear_85 = None
+    key_states_12 = view_38.transpose(1, 2)
+    view_38 = None
+    linear_86 = torch._C._nn.linear(
+        hidden_states_98,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_98 = l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_39 = linear_86.view((1, 3, -1, 128))
+    linear_86 = None
+    value_states_12 = view_39.transpose(1, 2)
+    view_39 = None
+    cos_15 = cos_2.unsqueeze(1)
+    sin_15 = sin_2.unsqueeze(1)
+    mul_113 = query_states_12 * cos_15
+    x1_24 = query_states_12[(Ellipsis, slice(None, 64, None))]
+    x2_24 = query_states_12[(Ellipsis, slice(64, None, None))]
+    query_states_12 = None
+    neg_24 = -x2_24
+    x2_24 = None
+    cat_25 = torch.cat((neg_24, x1_24), dim=-1)
+    neg_24 = x1_24 = None
+    mul_114 = cat_25 * sin_15
+    cat_25 = None
+    q_embed_12 = mul_113 + mul_114
+    mul_113 = mul_114 = None
+    mul_115 = key_states_12 * cos_15
+    cos_15 = None
+    x1_25 = key_states_12[(Ellipsis, slice(None, 64, None))]
+    x2_25 = key_states_12[(Ellipsis, slice(64, None, None))]
+    key_states_12 = None
+    neg_25 = -x2_25
+    x2_25 = None
+    cat_26 = torch.cat((neg_25, x1_25), dim=-1)
+    neg_25 = x1_25 = None
+    mul_116 = cat_26 * sin_15
+    cat_26 = sin_15 = None
+    k_embed_12 = mul_115 + mul_116
+    mul_115 = mul_116 = None
+    attention_mask_13 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_12 = q_embed_12.contiguous()
+    q_embed_12 = None
+    key_12 = k_embed_12.contiguous()
+    value_12 = value_states_12.contiguous()
+    attn_output_48 = torch._C._nn.scaled_dot_product_attention(
+        query_12,
+        key_12,
+        value_12,
+        attn_mask=attention_mask_13,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_12 = key_12 = value_12 = attention_mask_13 = None
+    transpose_52 = attn_output_48.transpose(1, 2)
+    attn_output_48 = None
+    attn_output_49 = transpose_52.contiguous()
+    transpose_52 = None
+    reshape_12 = attn_output_49.reshape(1, 3, -1)
+    attn_output_49 = None
+    attn_output_50 = reshape_12.contiguous()
+    reshape_12 = None
+    attn_output_51 = torch._C._nn.linear(
+        attn_output_50,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_50 = l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_99 = hidden_states_95 + attn_output_51
+    hidden_states_95 = attn_output_51 = None
+    hidden_states_100 = hidden_states_99.to(torch.float32)
+    pow_26 = hidden_states_100.pow(2)
+    variance_25 = pow_26.mean(-1, keepdim=True)
+    pow_26 = None
+    add_76 = variance_25 + 1e-06
+    variance_25 = None
+    rsqrt_25 = torch.rsqrt(add_76)
+    add_76 = None
+    hidden_states_101 = hidden_states_100 * rsqrt_25
+    hidden_states_100 = rsqrt_25 = None
+    to_55 = hidden_states_101.to(torch.bfloat16)
+    hidden_states_101 = None
+    hidden_states_102 = (
+        l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_
+        * to_55
+    )
+    l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = (
+        to_55
+    ) = None
+    linear_88 = torch._C._nn.linear(
+        hidden_states_102,
+        l_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_12 = torch.nn.functional.silu(linear_88, inplace=False)
+    linear_88 = None
+    linear_89 = torch._C._nn.linear(
+        hidden_states_102,
+        l_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_102 = l_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_119 = silu_12 * linear_89
+    silu_12 = linear_89 = None
+    down_proj_12 = torch._C._nn.linear(
+        mul_119,
+        l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_119 = l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_103 = hidden_states_99 + down_proj_12
+    hidden_states_99 = down_proj_12 = None
+    hidden_states_104 = hidden_states_103.to(torch.float32)
+    pow_27 = hidden_states_104.pow(2)
+    variance_26 = pow_27.mean(-1, keepdim=True)
+    pow_27 = None
+    add_78 = variance_26 + 1e-06
+    variance_26 = None
+    rsqrt_26 = torch.rsqrt(add_78)
+    add_78 = None
+    hidden_states_105 = hidden_states_104 * rsqrt_26
+    hidden_states_104 = rsqrt_26 = None
+    to_57 = hidden_states_105.to(torch.bfloat16)
+    hidden_states_105 = None
+    hidden_states_106 = (
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_
+        * to_57
+    )
+    l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = (
+        to_57
+    ) = None
+    linear_91 = torch._C._nn.linear(
+        hidden_states_106,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_40 = linear_91.view((1, 3, -1, 128))
+    linear_91 = None
+    query_states_13 = view_40.transpose(1, 2)
+    view_40 = None
+    linear_92 = torch._C._nn.linear(
+        hidden_states_106,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_41 = linear_92.view((1, 3, -1, 128))
+    linear_92 = None
+    key_states_13 = view_41.transpose(1, 2)
+    view_41 = None
+    linear_93 = torch._C._nn.linear(
+        hidden_states_106,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_106 = l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_42 = linear_93.view((1, 3, -1, 128))
+    linear_93 = None
+    value_states_13 = view_42.transpose(1, 2)
+    view_42 = None
+    cos_16 = cos_2.unsqueeze(1)
+    sin_16 = sin_2.unsqueeze(1)
+    mul_122 = query_states_13 * cos_16
+    x1_26 = query_states_13[(Ellipsis, slice(None, 64, None))]
+    x2_26 = query_states_13[(Ellipsis, slice(64, None, None))]
+    query_states_13 = None
+    neg_26 = -x2_26
+    x2_26 = None
+    cat_27 = torch.cat((neg_26, x1_26), dim=-1)
+    neg_26 = x1_26 = None
+    mul_123 = cat_27 * sin_16
+    cat_27 = None
+    q_embed_13 = mul_122 + mul_123
+    mul_122 = mul_123 = None
+    mul_124 = key_states_13 * cos_16
+    cos_16 = None
+    x1_27 = key_states_13[(Ellipsis, slice(None, 64, None))]
+    x2_27 = key_states_13[(Ellipsis, slice(64, None, None))]
+    key_states_13 = None
+    neg_27 = -x2_27
+    x2_27 = None
+    cat_28 = torch.cat((neg_27, x1_27), dim=-1)
+    neg_27 = x1_27 = None
+    mul_125 = cat_28 * sin_16
+    cat_28 = sin_16 = None
+    k_embed_13 = mul_124 + mul_125
+    mul_124 = mul_125 = None
+    attention_mask_14 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_13 = q_embed_13.contiguous()
+    q_embed_13 = None
+    key_13 = k_embed_13.contiguous()
+    value_13 = value_states_13.contiguous()
+    attn_output_52 = torch._C._nn.scaled_dot_product_attention(
+        query_13,
+        key_13,
+        value_13,
+        attn_mask=attention_mask_14,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_13 = key_13 = value_13 = attention_mask_14 = None
+    transpose_56 = attn_output_52.transpose(1, 2)
+    attn_output_52 = None
+    attn_output_53 = transpose_56.contiguous()
+    transpose_56 = None
+    reshape_13 = attn_output_53.reshape(1, 3, -1)
+    attn_output_53 = None
+    attn_output_54 = reshape_13.contiguous()
+    reshape_13 = None
+    attn_output_55 = torch._C._nn.linear(
+        attn_output_54,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_54 = l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_107 = hidden_states_103 + attn_output_55
+    hidden_states_103 = attn_output_55 = None
+    hidden_states_108 = hidden_states_107.to(torch.float32)
+    pow_28 = hidden_states_108.pow(2)
+    variance_27 = pow_28.mean(-1, keepdim=True)
+    pow_28 = None
+    add_82 = variance_27 + 1e-06
+    variance_27 = None
+    rsqrt_27 = torch.rsqrt(add_82)
+    add_82 = None
+    hidden_states_109 = hidden_states_108 * rsqrt_27
+    hidden_states_108 = rsqrt_27 = None
+    to_59 = hidden_states_109.to(torch.bfloat16)
+    hidden_states_109 = None
+    hidden_states_110 = (
+        l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_
+        * to_59
+    )
+    l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = (
+        to_59
+    ) = None
+    linear_95 = torch._C._nn.linear(
+        hidden_states_110,
+        l_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_13 = torch.nn.functional.silu(linear_95, inplace=False)
+    linear_95 = None
+    linear_96 = torch._C._nn.linear(
+        hidden_states_110,
+        l_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_110 = l_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_128 = silu_13 * linear_96
+    silu_13 = linear_96 = None
+    down_proj_13 = torch._C._nn.linear(
+        mul_128,
+        l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_128 = l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_111 = hidden_states_107 + down_proj_13
+    hidden_states_107 = down_proj_13 = None
+    hidden_states_112 = hidden_states_111.to(torch.float32)
+    pow_29 = hidden_states_112.pow(2)
+    variance_28 = pow_29.mean(-1, keepdim=True)
+    pow_29 = None
+    add_84 = variance_28 + 1e-06
+    variance_28 = None
+    rsqrt_28 = torch.rsqrt(add_84)
+    add_84 = None
+    hidden_states_113 = hidden_states_112 * rsqrt_28
+    hidden_states_112 = rsqrt_28 = None
+    to_61 = hidden_states_113.to(torch.bfloat16)
+    hidden_states_113 = None
+    hidden_states_114 = (
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_
+        * to_61
+    )
+    l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = (
+        to_61
+    ) = None
+    linear_98 = torch._C._nn.linear(
+        hidden_states_114,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_43 = linear_98.view((1, 3, -1, 128))
+    linear_98 = None
+    query_states_14 = view_43.transpose(1, 2)
+    view_43 = None
+    linear_99 = torch._C._nn.linear(
+        hidden_states_114,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_44 = linear_99.view((1, 3, -1, 128))
+    linear_99 = None
+    key_states_14 = view_44.transpose(1, 2)
+    view_44 = None
+    linear_100 = torch._C._nn.linear(
+        hidden_states_114,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_114 = l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_45 = linear_100.view((1, 3, -1, 128))
+    linear_100 = None
+    value_states_14 = view_45.transpose(1, 2)
+    view_45 = None
+    cos_17 = cos_2.unsqueeze(1)
+    sin_17 = sin_2.unsqueeze(1)
+    mul_131 = query_states_14 * cos_17
+    x1_28 = query_states_14[(Ellipsis, slice(None, 64, None))]
+    x2_28 = query_states_14[(Ellipsis, slice(64, None, None))]
+    query_states_14 = None
+    neg_28 = -x2_28
+    x2_28 = None
+    cat_29 = torch.cat((neg_28, x1_28), dim=-1)
+    neg_28 = x1_28 = None
+    mul_132 = cat_29 * sin_17
+    cat_29 = None
+    q_embed_14 = mul_131 + mul_132
+    mul_131 = mul_132 = None
+    mul_133 = key_states_14 * cos_17
+    cos_17 = None
+    x1_29 = key_states_14[(Ellipsis, slice(None, 64, None))]
+    x2_29 = key_states_14[(Ellipsis, slice(64, None, None))]
+    key_states_14 = None
+    neg_29 = -x2_29
+    x2_29 = None
+    cat_30 = torch.cat((neg_29, x1_29), dim=-1)
+    neg_29 = x1_29 = None
+    mul_134 = cat_30 * sin_17
+    cat_30 = sin_17 = None
+    k_embed_14 = mul_133 + mul_134
+    mul_133 = mul_134 = None
+    attention_mask_15 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_14 = q_embed_14.contiguous()
+    q_embed_14 = None
+    key_14 = k_embed_14.contiguous()
+    value_14 = value_states_14.contiguous()
+    attn_output_56 = torch._C._nn.scaled_dot_product_attention(
+        query_14,
+        key_14,
+        value_14,
+        attn_mask=attention_mask_15,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_14 = key_14 = value_14 = attention_mask_15 = None
+    transpose_60 = attn_output_56.transpose(1, 2)
+    attn_output_56 = None
+    attn_output_57 = transpose_60.contiguous()
+    transpose_60 = None
+    reshape_14 = attn_output_57.reshape(1, 3, -1)
+    attn_output_57 = None
+    attn_output_58 = reshape_14.contiguous()
+    reshape_14 = None
+    attn_output_59 = torch._C._nn.linear(
+        attn_output_58,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_58 = l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_115 = hidden_states_111 + attn_output_59
+    hidden_states_111 = attn_output_59 = None
+    hidden_states_116 = hidden_states_115.to(torch.float32)
+    pow_30 = hidden_states_116.pow(2)
+    variance_29 = pow_30.mean(-1, keepdim=True)
+    pow_30 = None
+    add_88 = variance_29 + 1e-06
+    variance_29 = None
+    rsqrt_29 = torch.rsqrt(add_88)
+    add_88 = None
+    hidden_states_117 = hidden_states_116 * rsqrt_29
+    hidden_states_116 = rsqrt_29 = None
+    to_63 = hidden_states_117.to(torch.bfloat16)
+    hidden_states_117 = None
+    hidden_states_118 = (
+        l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_
+        * to_63
+    )
+    l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = (
+        to_63
+    ) = None
+    linear_102 = torch._C._nn.linear(
+        hidden_states_118,
+        l_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_14 = torch.nn.functional.silu(linear_102, inplace=False)
+    linear_102 = None
+    linear_103 = torch._C._nn.linear(
+        hidden_states_118,
+        l_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_118 = l_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_137 = silu_14 * linear_103
+    silu_14 = linear_103 = None
+    down_proj_14 = torch._C._nn.linear(
+        mul_137,
+        l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_137 = l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_119 = hidden_states_115 + down_proj_14
+    hidden_states_115 = down_proj_14 = None
+    hidden_states_120 = hidden_states_119.to(torch.float32)
+    pow_31 = hidden_states_120.pow(2)
+    variance_30 = pow_31.mean(-1, keepdim=True)
+    pow_31 = None
+    add_90 = variance_30 + 1e-06
+    variance_30 = None
+    rsqrt_30 = torch.rsqrt(add_90)
+    add_90 = None
+    hidden_states_121 = hidden_states_120 * rsqrt_30
+    hidden_states_120 = rsqrt_30 = None
+    to_65 = hidden_states_121.to(torch.bfloat16)
+    hidden_states_121 = None
+    hidden_states_122 = (
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_
+        * to_65
+    )
+    l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = (
+        to_65
+    ) = None
+    linear_105 = torch._C._nn.linear(
+        hidden_states_122,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_46 = linear_105.view((1, 3, -1, 128))
+    linear_105 = None
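+    # --- Editor's sketch (illustrative, not part of the captured graph): the
+    # cos/sin multiplies below implement rotary position embedding via the
+    # "rotate half" trick: the top and bottom 64 channels of each 128-dim head
+    # are swapped with a sign flip, then recombined:
+    def _rope_sketch(x, cos, sin, half=64):
+        x1, x2 = x[..., :half], x[..., half:]
+        return x * cos + torch.cat((-x2, x1), dim=-1) * sin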
+    query_states_15 = view_46.transpose(1, 2)
+    view_46 = None
+    linear_106 = torch._C._nn.linear(
+        hidden_states_122,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_47 = linear_106.view((1, 3, -1, 128))
+    linear_106 = None
+    key_states_15 = view_47.transpose(1, 2)
+    view_47 = None
+    linear_107 = torch._C._nn.linear(
+        hidden_states_122,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_122 = l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_48 = linear_107.view((1, 3, -1, 128))
+    linear_107 = None
+    value_states_15 = view_48.transpose(1, 2)
+    view_48 = None
+    cos_18 = cos_2.unsqueeze(1)
+    sin_18 = sin_2.unsqueeze(1)
+    mul_140 = query_states_15 * cos_18
+    x1_30 = query_states_15[(Ellipsis, slice(None, 64, None))]
+    x2_30 = query_states_15[(Ellipsis, slice(64, None, None))]
+    query_states_15 = None
+    neg_30 = -x2_30
+    x2_30 = None
+    cat_31 = torch.cat((neg_30, x1_30), dim=-1)
+    neg_30 = x1_30 = None
+    mul_141 = cat_31 * sin_18
+    cat_31 = None
+    q_embed_15 = mul_140 + mul_141
+    mul_140 = mul_141 = None
+    mul_142 = key_states_15 * cos_18
+    cos_18 = None
+    x1_31 = key_states_15[(Ellipsis, slice(None, 64, None))]
+    x2_31 = key_states_15[(Ellipsis, slice(64, None, None))]
+    key_states_15 = None
+    neg_31 = -x2_31
+    x2_31 = None
+    cat_32 = torch.cat((neg_31, x1_31), dim=-1)
+    neg_31 = x1_31 = None
+    mul_143 = cat_32 * sin_18
+    cat_32 = sin_18 = None
+    k_embed_15 = mul_142 + mul_143
+    mul_142 = mul_143 = None
+    attention_mask_16 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_15 = q_embed_15.contiguous()
+    q_embed_15 = None
+    key_15 = k_embed_15.contiguous()
+    value_15 = value_states_15.contiguous()
+    attn_output_60 = torch._C._nn.scaled_dot_product_attention(
+        query_15,
+        key_15,
+        value_15,
+        attn_mask=attention_mask_16,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_15 = key_15 = value_15 = attention_mask_16 = None
+    transpose_64 = attn_output_60.transpose(1, 2)
+    attn_output_60 = None
+    attn_output_61 = transpose_64.contiguous()
+    transpose_64 = None
+    reshape_15 = attn_output_61.reshape(1, 3, -1)
+    attn_output_61 = None
+    attn_output_62 = reshape_15.contiguous()
+    reshape_15 = None
+    attn_output_63 = torch._C._nn.linear(
+        attn_output_62,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_62 = l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_123 = hidden_states_119 + attn_output_63
+    hidden_states_119 = attn_output_63 = None
+    hidden_states_124 = hidden_states_123.to(torch.float32)
+    pow_32 = hidden_states_124.pow(2)
+    variance_31 = pow_32.mean(-1, keepdim=True)
+    pow_32 = None
+    add_94 = variance_31 + 1e-06
+    variance_31 = None
+    rsqrt_31 = torch.rsqrt(add_94)
+    add_94 = None
+    hidden_states_125 = hidden_states_124 * rsqrt_31
+    hidden_states_124 = rsqrt_31 = None
+    to_67 = hidden_states_125.to(torch.bfloat16)
+    hidden_states_125 = None
+    hidden_states_126 = (
+        l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_
+        * to_67
+    )
+    l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = (
+        to_67
+    ) = None
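+    # --- Editor's sketch (illustrative, not part of the captured graph): the
+    # three projections below form the SwiGLU MLP used by every layer,
+    # down_proj(silu(gate_proj(x)) * up_proj(x)), with hypothetical weight names:
+    def _swiglu_mlp_sketch(x, w_gate, w_up, w_down):
+        return torch.nn.functional.linear(
+            torch.nn.functional.silu(torch.nn.functional.linear(x, w_gate))
+            * torch.nn.functional.linear(x, w_up),
+            w_down,
+        )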
+    linear_109 = torch._C._nn.linear(
+        hidden_states_126,
+        l_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_15 = torch.nn.functional.silu(linear_109, inplace=False)
+    linear_109 = None
+    linear_110 = torch._C._nn.linear(
+        hidden_states_126,
+        l_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_126 = l_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_146 = silu_15 * linear_110
+    silu_15 = linear_110 = None
+    down_proj_15 = torch._C._nn.linear(
+        mul_146,
+        l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_146 = l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_127 = hidden_states_123 + down_proj_15
+    hidden_states_123 = down_proj_15 = None
+    hidden_states_128 = hidden_states_127.to(torch.float32)
+    pow_33 = hidden_states_128.pow(2)
+    variance_32 = pow_33.mean(-1, keepdim=True)
+    pow_33 = None
+    add_96 = variance_32 + 1e-06
+    variance_32 = None
+    rsqrt_32 = torch.rsqrt(add_96)
+    add_96 = None
+    hidden_states_129 = hidden_states_128 * rsqrt_32
+    hidden_states_128 = rsqrt_32 = None
+    to_69 = hidden_states_129.to(torch.bfloat16)
+    hidden_states_129 = None
+    hidden_states_130 = (
+        l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_
+        * to_69
+    )
+    l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = (
+        to_69
+    ) = None
+    linear_112 = torch._C._nn.linear(
+        hidden_states_130,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_49 = linear_112.view((1, 3, -1, 128))
+    linear_112 = None
+    query_states_16 = view_49.transpose(1, 2)
+    view_49 = None
+    linear_113 = torch._C._nn.linear(
+        hidden_states_130,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_50 = linear_113.view((1, 3, -1, 128))
+    linear_113 = None
+    key_states_16 = view_50.transpose(1, 2)
+    view_50 = None
+    linear_114 = torch._C._nn.linear(
+        hidden_states_130,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_130 = l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_51 = linear_114.view((1, 3, -1, 128))
+    linear_114 = None
+    value_states_16 = view_51.transpose(1, 2)
+    view_51 = None
+    cos_19 = cos_2.unsqueeze(1)
+    sin_19 = sin_2.unsqueeze(1)
+    mul_149 = query_states_16 * cos_19
+    x1_32 = query_states_16[(Ellipsis, slice(None, 64, None))]
+    x2_32 = query_states_16[(Ellipsis, slice(64, None, None))]
+    query_states_16 = None
+    neg_32 = -x2_32
+    x2_32 = None
+    cat_33 = torch.cat((neg_32, x1_32), dim=-1)
+    neg_32 = x1_32 = None
+    mul_150 = cat_33 * sin_19
+    cat_33 = None
+    q_embed_16 = mul_149 + mul_150
+    mul_149 = mul_150 = None
+    mul_151 = key_states_16 * cos_19
+    cos_19 = None
+    x1_33 = key_states_16[(Ellipsis, slice(None, 64, None))]
+    x2_33 = key_states_16[(Ellipsis, slice(64, None, None))]
+    key_states_16 = None
+    neg_33 = -x2_33
+    x2_33 = None
+    cat_34 = torch.cat((neg_33, x1_33), dim=-1)
+    neg_33 = x1_33 = None
+    mul_152 = cat_34 * sin_19
+    cat_34 = sin_19 = None
+    k_embed_16 = mul_151 + mul_152
+    mul_151 = mul_152 = None
+    attention_mask_17 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_16 = q_embed_16.contiguous()
+    q_embed_16 = None
+    key_16 = k_embed_16.contiguous()
+    value_16 = value_states_16.contiguous()
+    attn_output_64 = torch._C._nn.scaled_dot_product_attention(
+        query_16,
+        key_16,
+        value_16,
+        attn_mask=attention_mask_17,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_16 = key_16 = value_16 = attention_mask_17 = None
+    transpose_68 = attn_output_64.transpose(1, 2)
+    attn_output_64 = None
+    attn_output_65 = transpose_68.contiguous()
+    transpose_68 = None
+    reshape_16 = attn_output_65.reshape(1, 3, -1)
+    attn_output_65 = None
+    attn_output_66 = reshape_16.contiguous()
+    reshape_16 = None
+    attn_output_67 = torch._C._nn.linear(
+        attn_output_66,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_66 = l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_131 = hidden_states_127 + attn_output_67
+    hidden_states_127 = attn_output_67 = None
+    hidden_states_132 = hidden_states_131.to(torch.float32)
+    pow_34 = hidden_states_132.pow(2)
+    variance_33 = pow_34.mean(-1, keepdim=True)
+    pow_34 = None
+    add_100 = variance_33 + 1e-06
+    variance_33 = None
+    rsqrt_33 = torch.rsqrt(add_100)
+    add_100 = None
+    hidden_states_133 = hidden_states_132 * rsqrt_33
+    hidden_states_132 = rsqrt_33 = None
+    to_71 = hidden_states_133.to(torch.bfloat16)
+    hidden_states_133 = None
+    hidden_states_134 = (
+        l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_
+        * to_71
+    )
+    l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = (
+        to_71
+    ) = None
+    linear_116 = torch._C._nn.linear(
+        hidden_states_134,
+        l_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_16 = torch.nn.functional.silu(linear_116, inplace=False)
+    linear_116 = None
+    linear_117 = torch._C._nn.linear(
+        hidden_states_134,
+        l_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_134 = l_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_155 = silu_16 * linear_117
+    silu_16 = linear_117 = None
+    down_proj_16 = torch._C._nn.linear(
+        mul_155,
+        l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_155 = l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_135 = hidden_states_131 + down_proj_16
+    hidden_states_131 = down_proj_16 = None
+    hidden_states_136 = hidden_states_135.to(torch.float32)
+    pow_35 = hidden_states_136.pow(2)
+    variance_34 = pow_35.mean(-1, keepdim=True)
+    pow_35 = None
+    add_102 = variance_34 + 1e-06
+    variance_34 = None
+    rsqrt_34 = torch.rsqrt(add_102)
+    add_102 = None
+    hidden_states_137 = hidden_states_136 * rsqrt_34
+    hidden_states_136 = rsqrt_34 = None
+    to_73 = hidden_states_137.to(torch.bfloat16)
+    hidden_states_137 = None
+    hidden_states_138 = (
+        l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_
+        * to_73
+    )
+    l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = (
+        to_73
+    ) = None
+    linear_119 = torch._C._nn.linear(
+        hidden_states_138,
+        l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_52 = linear_119.view((1, 3, -1, 128))
+    linear_119 = None
+    query_states_17 = view_52.transpose(1, 2)
+    view_52 = None
+    linear_120 = torch._C._nn.linear(
+        hidden_states_138,
+        l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_53 = linear_120.view((1, 3, -1, 128))
+    linear_120 = None
+    key_states_17 = view_53.transpose(1, 2)
+    view_53 = None
+    linear_121 = torch._C._nn.linear(
+        hidden_states_138,
+        l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_138 = l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_54 = linear_121.view((1, 3, -1, 128))
+    linear_121 = None
+    value_states_17 = view_54.transpose(1, 2)
+    view_54 = None
+    cos_20 = cos_2.unsqueeze(1)
+    sin_20 = sin_2.unsqueeze(1)
+    mul_158 = query_states_17 * cos_20
+    x1_34 = query_states_17[(Ellipsis, slice(None, 64, None))]
+    x2_34 = query_states_17[(Ellipsis, slice(64, None, None))]
+    query_states_17 = None
+    neg_34 = -x2_34
+    x2_34 = None
+    cat_35 = torch.cat((neg_34, x1_34), dim=-1)
+    neg_34 = x1_34 = None
+    mul_159 = cat_35 * sin_20
+    cat_35 = None
+    q_embed_17 = mul_158 + mul_159
+    mul_158 = mul_159 = None
+    mul_160 = key_states_17 * cos_20
+    cos_20 = None
+    x1_35 = key_states_17[(Ellipsis, slice(None, 64, None))]
+    x2_35 = key_states_17[(Ellipsis, slice(64, None, None))]
+    key_states_17 = None
+    neg_35 = -x2_35
+    x2_35 = None
+    cat_36 = torch.cat((neg_35, x1_35), dim=-1)
+    neg_35 = x1_35 = None
+    mul_161 = cat_36 * sin_20
+    cat_36 = sin_20 = None
+    k_embed_17 = mul_160 + mul_161
+    mul_160 = mul_161 = None
+    attention_mask_18 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_17 = q_embed_17.contiguous()
+    q_embed_17 = None
+    key_17 = k_embed_17.contiguous()
+    value_17 = value_states_17.contiguous()
+    attn_output_68 = torch._C._nn.scaled_dot_product_attention(
+        query_17,
+        key_17,
+        value_17,
+        attn_mask=attention_mask_18,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_17 = key_17 = value_17 = attention_mask_18 = None
+    transpose_72 = attn_output_68.transpose(1, 2)
+    attn_output_68 = None
+    attn_output_69 = transpose_72.contiguous()
+    transpose_72 = None
+    reshape_17 = attn_output_69.reshape(1, 3, -1)
+    attn_output_69 = None
+    attn_output_70 = reshape_17.contiguous()
+    reshape_17 = None
+    attn_output_71 = torch._C._nn.linear(
+        attn_output_70,
+        l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_70 = l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_139 = hidden_states_135 + attn_output_71
+    hidden_states_135 = attn_output_71 = None
+    hidden_states_140 = hidden_states_139.to(torch.float32)
+    pow_36 = hidden_states_140.pow(2)
+    variance_35 = pow_36.mean(-1, keepdim=True)
+    pow_36 = None
+    add_106 = variance_35 + 1e-06
+    variance_35 = None
+    rsqrt_35 = torch.rsqrt(add_106)
+    add_106 = None
+    hidden_states_141 = hidden_states_140 * rsqrt_35
+    hidden_states_140 = rsqrt_35 = None
+    to_75 = hidden_states_141.to(torch.bfloat16)
+    hidden_states_141 = None
+    hidden_states_142 = (
+        l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_
+        * to_75
+    )
+    l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = (
+        to_75
+    ) = None
+    linear_123 = torch._C._nn.linear(
+        hidden_states_142,
+        l_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_17 = torch.nn.functional.silu(linear_123, inplace=False)
+    linear_123 = None
+    linear_124 = torch._C._nn.linear(
+        hidden_states_142,
+        l_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_142 = l_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_164 = silu_17 * linear_124
+    silu_17 = linear_124 = None
+    down_proj_17 = torch._C._nn.linear(
+        mul_164,
+        l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_164 = l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_143 = hidden_states_139 + down_proj_17
+    hidden_states_139 = down_proj_17 = None
+    hidden_states_144 = hidden_states_143.to(torch.float32)
+    pow_37 = hidden_states_144.pow(2)
+    variance_36 = pow_37.mean(-1, keepdim=True)
+    pow_37 = None
+    add_108 = variance_36 + 1e-06
+    variance_36 = None
+    rsqrt_36 = torch.rsqrt(add_108)
+    add_108 = None
+    hidden_states_145 = hidden_states_144 * rsqrt_36
+    hidden_states_144 = rsqrt_36 = None
+    to_77 = hidden_states_145.to(torch.bfloat16)
+    hidden_states_145 = None
+    hidden_states_146 = (
+        l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_
+        * to_77
+    )
+    l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = (
+        to_77
+    ) = None
+    linear_126 = torch._C._nn.linear(
+        hidden_states_146,
+        l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_55 = linear_126.view((1, 3, -1, 128))
+    linear_126 = None
+    query_states_18 = view_55.transpose(1, 2)
+    view_55 = None
+    linear_127 = torch._C._nn.linear(
+        hidden_states_146,
+        l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_56 = linear_127.view((1, 3, -1, 128))
+    linear_127 = None
+    key_states_18 = view_56.transpose(1, 2)
+    view_56 = None
+    linear_128 = torch._C._nn.linear(
+        hidden_states_146,
+        l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_146 = l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_57 = linear_128.view((1, 3, -1, 128))
+    linear_128 = None
+    value_states_18 = view_57.transpose(1, 2)
+    view_57 = None
+    cos_21 = cos_2.unsqueeze(1)
+    sin_21 = sin_2.unsqueeze(1)
+    mul_167 = query_states_18 * cos_21
+    x1_36 = query_states_18[(Ellipsis, slice(None, 64, None))]
+    x2_36 = query_states_18[(Ellipsis, slice(64, None, None))]
+    query_states_18 = None
+    neg_36 = -x2_36
+    x2_36 = None
+    cat_37 = torch.cat((neg_36, x1_36), dim=-1)
+    neg_36 = x1_36 = None
+    mul_168 = cat_37 * sin_21
+    cat_37 = None
+    q_embed_18 = mul_167 + mul_168
+    mul_167 = mul_168 = None
+    mul_169 = key_states_18 * cos_21
+    cos_21 = None
+    x1_37 = key_states_18[(Ellipsis, slice(None, 64, None))]
+    x2_37 = key_states_18[(Ellipsis, slice(64, None, None))]
+    key_states_18 = None
+    neg_37 = -x2_37
+    x2_37 = None
+    cat_38 = torch.cat((neg_37, x1_37), dim=-1)
+    neg_37 = x1_37 = None
+    mul_170 = cat_38 * sin_21
+    cat_38 = sin_21 = None
+    k_embed_18 = mul_169 + mul_170
+    mul_169 = mul_170 = None
+    attention_mask_19 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_18 = q_embed_18.contiguous()
+    q_embed_18 = None
+    key_18 = k_embed_18.contiguous()
+    value_18 = value_states_18.contiguous()
+    attn_output_72 = torch._C._nn.scaled_dot_product_attention(
+        query_18,
+        key_18,
+        value_18,
+        attn_mask=attention_mask_19,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_18 = key_18 = value_18 = attention_mask_19 = None
+    transpose_76 = attn_output_72.transpose(1, 2)
+    attn_output_72 = None
+    attn_output_73 = transpose_76.contiguous()
+    transpose_76 = None
+    reshape_18 = attn_output_73.reshape(1, 3, -1)
+    attn_output_73 = None
+    attn_output_74 = reshape_18.contiguous()
+    reshape_18 = None
+    attn_output_75 = torch._C._nn.linear(
+        attn_output_74,
+        l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_74 = l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_147 = hidden_states_143 + attn_output_75
+    hidden_states_143 = attn_output_75 = None
+    hidden_states_148 = hidden_states_147.to(torch.float32)
+    pow_38 = hidden_states_148.pow(2)
+    variance_37 = pow_38.mean(-1, keepdim=True)
+    pow_38 = None
+    add_112 = variance_37 + 1e-06
+    variance_37 = None
+    rsqrt_37 = torch.rsqrt(add_112)
+    add_112 = None
+    hidden_states_149 = hidden_states_148 * rsqrt_37
+    hidden_states_148 = rsqrt_37 = None
+    to_79 = hidden_states_149.to(torch.bfloat16)
+    hidden_states_149 = None
+    hidden_states_150 = (
+        l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_
+        * to_79
+    )
+    l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = (
+        to_79
+    ) = None
+    linear_130 = torch._C._nn.linear(
+        hidden_states_150,
+        l_self_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_18 = torch.nn.functional.silu(linear_130, inplace=False)
+    linear_130 = None
+    linear_131 = torch._C._nn.linear(
+        hidden_states_150,
+        l_self_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_150 = l_self_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_173 = silu_18 * linear_131
+    silu_18 = linear_131 = None
+    down_proj_18 = torch._C._nn.linear(
+        mul_173,
+        l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_173 = l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_151 = hidden_states_147 + down_proj_18
+    hidden_states_147 = down_proj_18 = None
+    hidden_states_152 = hidden_states_151.to(torch.float32)
+    pow_39 = hidden_states_152.pow(2)
+    variance_38 = pow_39.mean(-1, keepdim=True)
+    pow_39 = None
+    add_114 = variance_38 + 1e-06
+    variance_38 = None
+    rsqrt_38 = torch.rsqrt(add_114)
+    add_114 = None
+    hidden_states_153 = hidden_states_152 * rsqrt_38
+    hidden_states_152 = rsqrt_38 = None
+    to_81 = hidden_states_153.to(torch.bfloat16)
+    hidden_states_153 = None
+    hidden_states_154 = (
+        l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_
+        * to_81
+    )
+    l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = (
+        to_81
+    ) = None
+    linear_133 = torch._C._nn.linear(
+        hidden_states_154,
+        l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_58 = linear_133.view((1, 3, -1, 128))
+    linear_133 = None
+    query_states_19 = view_58.transpose(1, 2)
+    view_58 = None
+    linear_134 = torch._C._nn.linear(
+        hidden_states_154,
+        l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_59 = linear_134.view((1, 3, -1, 128))
+    linear_134 = None
+    key_states_19 = view_59.transpose(1, 2)
+    view_59 = None
+    linear_135 = torch._C._nn.linear(
+        hidden_states_154,
+        l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_154 = l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_60 = linear_135.view((1, 3, -1, 128))
+    linear_135 = None
+    value_states_19 = view_60.transpose(1, 2)
+    view_60 = None
+    cos_22 = cos_2.unsqueeze(1)
+    sin_22 = sin_2.unsqueeze(1)
+    mul_176 = query_states_19 * cos_22
+    x1_38 = query_states_19[(Ellipsis, slice(None, 64, None))]
+    x2_38 = query_states_19[(Ellipsis, slice(64, None, None))]
+    query_states_19 = None
+    neg_38 = -x2_38
+    x2_38 = None
+    cat_39 = torch.cat((neg_38, x1_38), dim=-1)
+    neg_38 = x1_38 = None
+    mul_177 = cat_39 * sin_22
+    cat_39 = None
+    q_embed_19 = mul_176 + mul_177
+    mul_176 = mul_177 = None
+    mul_178 = key_states_19 * cos_22
+    cos_22 = None
+    x1_39 = key_states_19[(Ellipsis, slice(None, 64, None))]
+    x2_39 = key_states_19[(Ellipsis, slice(64, None, None))]
+    key_states_19 = None
+    neg_39 = -x2_39
+    x2_39 = None
+    cat_40 = torch.cat((neg_39, x1_39), dim=-1)
+    neg_39 = x1_39 = None
+    mul_179 = cat_40 * sin_22
+    cat_40 = sin_22 = None
+    k_embed_19 = mul_178 + mul_179
+    mul_178 = mul_179 = None
+    attention_mask_20 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_19 = q_embed_19.contiguous()
+    q_embed_19 = None
+    key_19 = k_embed_19.contiguous()
+    value_19 = value_states_19.contiguous()
+    attn_output_76 = torch._C._nn.scaled_dot_product_attention(
+        query_19,
+        key_19,
+        value_19,
+        attn_mask=attention_mask_20,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_19 = key_19 = value_19 = attention_mask_20 = None
+    transpose_80 = attn_output_76.transpose(1, 2)
+    attn_output_76 = None
+    attn_output_77 = transpose_80.contiguous()
+    transpose_80 = None
+    reshape_19 = attn_output_77.reshape(1, 3, -1)
+    attn_output_77 = None
+    attn_output_78 = reshape_19.contiguous()
+    reshape_19 = None
+    attn_output_79 = torch._C._nn.linear(
+        attn_output_78,
+        l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_78 = l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_155 = hidden_states_151 + attn_output_79
+    hidden_states_151 = attn_output_79 = None
+    hidden_states_156 = hidden_states_155.to(torch.float32)
+    pow_40 = hidden_states_156.pow(2)
+    variance_39 = pow_40.mean(-1, keepdim=True)
+    pow_40 = None
+    add_118 = variance_39 + 1e-06
+    variance_39 = None
+    rsqrt_39 = torch.rsqrt(add_118)
+    add_118 = None
+    hidden_states_157 = hidden_states_156 * rsqrt_39
+    hidden_states_156 = rsqrt_39 = None
+    to_83 = hidden_states_157.to(torch.bfloat16)
+    hidden_states_157 = None
+    hidden_states_158 = (
+        l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_
+        * to_83
+    )
+    l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = (
+        to_83
+    ) = None
+    linear_137 = torch._C._nn.linear(
+        hidden_states_158,
+        l_self_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_19 = torch.nn.functional.silu(linear_137, inplace=False)
+    linear_137 = None
+    linear_138 = torch._C._nn.linear(
+        hidden_states_158,
+        l_self_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_158 = l_self_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_182 = silu_19 * linear_138
+    silu_19 = linear_138 = None
+    down_proj_19 = torch._C._nn.linear(
+        mul_182,
+        l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_182 = l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_159 = hidden_states_155 + down_proj_19
+    hidden_states_155 = down_proj_19 = None
+    hidden_states_160 = hidden_states_159.to(torch.float32)
+    pow_41 = hidden_states_160.pow(2)
+    variance_40 = pow_41.mean(-1, keepdim=True)
+    pow_41 = None
+    add_120 = variance_40 + 1e-06
+    variance_40 = None
+    rsqrt_40 = torch.rsqrt(add_120)
+    add_120 = None
+    hidden_states_161 = hidden_states_160 * rsqrt_40
+    hidden_states_160 = rsqrt_40 = None
+    to_85 = hidden_states_161.to(torch.bfloat16)
+    hidden_states_161 = None
+    hidden_states_162 = (
+        l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_
+        * to_85
+    )
+    l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = (
+        to_85
+    ) = None
+    linear_140 = torch._C._nn.linear(
+        hidden_states_162,
+        l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_61 = linear_140.view((1, 3, -1, 128))
+    linear_140 = None
+    query_states_20 = view_61.transpose(1, 2)
+    view_61 = None
+    linear_141 = torch._C._nn.linear(
+        hidden_states_162,
+        l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_62 = linear_141.view((1, 3, -1, 128))
+    linear_141 = None
+    key_states_20 = view_62.transpose(1, 2)
+    view_62 = None
+    linear_142 = torch._C._nn.linear(
+        hidden_states_162,
+        l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_162 = l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_63 = linear_142.view((1, 3, -1, 128))
+    linear_142 = None
+    value_states_20 = view_63.transpose(1, 2)
+    view_63 = None
+    cos_23 = cos_2.unsqueeze(1)
+    sin_23 = sin_2.unsqueeze(1)
+    mul_185 = query_states_20 * cos_23
+    x1_40 = query_states_20[(Ellipsis, slice(None, 64, None))]
+    x2_40 = query_states_20[(Ellipsis, slice(64, None, None))]
+    query_states_20 = None
+    neg_40 = -x2_40
+    x2_40 = None
+    cat_41 = torch.cat((neg_40, x1_40), dim=-1)
+    neg_40 = x1_40 = None
+    mul_186 = cat_41 * sin_23
+    cat_41 = None
+    q_embed_20 = mul_185 + mul_186
+    mul_185 = mul_186 = None
+    mul_187 = key_states_20 * cos_23
+    cos_23 = None
+    x1_41 = key_states_20[(Ellipsis, slice(None, 64, None))]
+    x2_41 = key_states_20[(Ellipsis, slice(64, None, None))]
+    key_states_20 = None
+    neg_41 = -x2_41
+    x2_41 = None
+    cat_42 = torch.cat((neg_41, x1_41), dim=-1)
+    neg_41 = x1_41 = None
+    mul_188 = cat_42 * sin_23
+    cat_42 = sin_23 = None
+    k_embed_20 = mul_187 + mul_188
+    mul_187 = mul_188 = None
+    attention_mask_21 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_20 = q_embed_20.contiguous()
+    q_embed_20 = None
+    key_20 = k_embed_20.contiguous()
+    value_20 = value_states_20.contiguous()
+    attn_output_80 = torch._C._nn.scaled_dot_product_attention(
+        query_20,
+        key_20,
+        value_20,
+        attn_mask=attention_mask_21,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_20 = key_20 = value_20 = attention_mask_21 = None
+    transpose_84 = attn_output_80.transpose(1, 2)
+    attn_output_80 = None
+    attn_output_81 = transpose_84.contiguous()
+    transpose_84 = None
+    reshape_20 = attn_output_81.reshape(1, 3, -1)
+    attn_output_81 = None
+    attn_output_82 = reshape_20.contiguous()
+    reshape_20 = None
+    attn_output_83 = torch._C._nn.linear(
+        attn_output_82,
+        l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_82 = l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    hidden_states_163 = hidden_states_159 + attn_output_83
+    hidden_states_159 = attn_output_83 = None
+    hidden_states_164 = hidden_states_163.to(torch.float32)
+    pow_42 = hidden_states_164.pow(2)
+    variance_41 = pow_42.mean(-1, keepdim=True)
+    pow_42 = None
+    add_124 = variance_41 + 1e-06
+    variance_41 = None
+    rsqrt_41 = torch.rsqrt(add_124)
+    add_124 = None
+    hidden_states_165 = hidden_states_164 * rsqrt_41
+    hidden_states_164 = rsqrt_41 = None
+    to_87 = hidden_states_165.to(torch.bfloat16)
+    hidden_states_165 = None
+    hidden_states_166 = (
+        l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_
+        * to_87
+    )
+    l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = (
+        to_87
+    ) = None
+    linear_144 = torch._C._nn.linear(
+        hidden_states_166,
+        l_self_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ = (
+        None
+    )
+    silu_20 = torch.nn.functional.silu(linear_144, inplace=False)
+    linear_144 = None
+    linear_145 = torch._C._nn.linear(
+        hidden_states_166,
+        l_self_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_166 = l_self_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+    mul_191 = silu_20 * linear_145
+    silu_20 = linear_145 = None
+    down_proj_20 = torch._C._nn.linear(
+        mul_191,
+        l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    mul_191 = l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    hidden_states_167 = hidden_states_163 + down_proj_20
+    hidden_states_163 = down_proj_20 = None
+    hidden_states_168 = hidden_states_167.to(torch.float32)
+    pow_43 = hidden_states_168.pow(2)
+    variance_42 = pow_43.mean(-1, keepdim=True)
+    pow_43 = None
+    add_126 = variance_42 + 1e-06
+    variance_42 = None
+    rsqrt_42 = torch.rsqrt(add_126)
+    add_126 = None
+    hidden_states_169 = hidden_states_168 * rsqrt_42
+    hidden_states_168 = rsqrt_42 = None
+    to_89 = hidden_states_169.to(torch.bfloat16)
+    hidden_states_169 = None
+    hidden_states_170 = (
+        l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_
+        * to_89
+    )
+    l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = (
+        to_89
+    ) = None
+    linear_147 = torch._C._nn.linear(
+        hidden_states_170,
+        l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = (
+        None
+    )
+    view_64 = linear_147.view((1, 3, -1, 128))
+    linear_147 = None
+    query_states_21 = view_64.transpose(1, 2)
+    view_64 = None
+    linear_148 = torch._C._nn.linear(
+        hidden_states_170,
+        l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_,
+        None,
+    )
+    l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = (
+        None
+    )
+    view_65 = linear_148.view((1, 3, -1, 128))
+    linear_148 = None
+    key_states_21 = view_65.transpose(1, 2)
+    view_65 = None
+    linear_149 = torch._C._nn.linear(
+        hidden_states_170,
+        l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_170 = l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+    view_66 = linear_149.view((1, 3, -1, 128))
+    linear_149 = None
+    value_states_21 = view_66.transpose(1, 2)
+    view_66 = None
+    cos_24 = cos_2.unsqueeze(1)
+    sin_24 = sin_2.unsqueeze(1)
+    mul_194 = query_states_21 * cos_24
+    x1_42 = query_states_21[(Ellipsis, slice(None, 64, None))]
+    x2_42 = query_states_21[(Ellipsis, slice(64, None, None))]
+    query_states_21 = None
+    neg_42 = -x2_42
+    x2_42 = None
+    cat_43 = torch.cat((neg_42, x1_42), dim=-1)
+    neg_42 = x1_42 = None
+    mul_195 = cat_43 * sin_24
+    cat_43 = None
+    q_embed_21 = mul_194 + mul_195
+    mul_194 = mul_195 = None
+    mul_196 = key_states_21 * cos_24
+    cos_24 = None
+    x1_43 = key_states_21[(Ellipsis, slice(None, 64, None))]
+    x2_43 = key_states_21[(Ellipsis, slice(64, None, None))]
+    key_states_21 = None
+    neg_43 = -x2_43
+    x2_43 = None
+    cat_44 = torch.cat((neg_43, x1_43), dim=-1)
+    neg_43 = x1_43 = None
+    mul_197 = cat_44 * sin_24
+    cat_44 = sin_24 = None
+    k_embed_21 = mul_196 + mul_197
+    mul_196 = mul_197 = None
+    attention_mask_22 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 3, None),
+        )
+    ]
+    query_21 = q_embed_21.contiguous()
+    q_embed_21 = None
+    key_21 = k_embed_21.contiguous()
+    value_21 = value_states_21.contiguous()
+    attn_output_84 = torch._C._nn.scaled_dot_product_attention(
+        query_21,
+        key_21,
+        value_21,
+        attn_mask=attention_mask_22,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_21 = key_21 = value_21 = attention_mask_22 = None
+    transpose_88 = attn_output_84.transpose(1, 2)
attn_output_84 = None + attn_output_85 = transpose_88.contiguous() + transpose_88 = None + reshape_21 = attn_output_85.reshape(1, 3, -1) + attn_output_85 = None + attn_output_86 = reshape_21.contiguous() + reshape_21 = None + attn_output_87 = torch._C._nn.linear( + attn_output_86, + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_86 = l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_171 = hidden_states_167 + attn_output_87 + hidden_states_167 = attn_output_87 = None + hidden_states_172 = hidden_states_171.to(torch.float32) + pow_44 = hidden_states_172.pow(2) + variance_43 = pow_44.mean(-1, keepdim=True) + pow_44 = None + add_130 = variance_43 + 1e-06 + variance_43 = None + rsqrt_43 = torch.rsqrt(add_130) + add_130 = None + hidden_states_173 = hidden_states_172 * rsqrt_43 + hidden_states_172 = rsqrt_43 = None + to_91 = hidden_states_173.to(torch.bfloat16) + hidden_states_173 = None + hidden_states_174 = ( + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + * to_91 + ) + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = ( + to_91 + ) = None + linear_151 = torch._C._nn.linear( + hidden_states_174, + l_self_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_21 = torch.nn.functional.silu(linear_151, inplace=False) + linear_151 = None + linear_152 = torch._C._nn.linear( + hidden_states_174, + l_self_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_174 = l_self_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_200 = silu_21 * linear_152 + silu_21 = linear_152 = None + down_proj_21 = torch._C._nn.linear( + mul_200, + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_200 = l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_175 = hidden_states_171 + down_proj_21 + hidden_states_171 = down_proj_21 = None + hidden_states_176 = hidden_states_175.to(torch.float32) + pow_45 = hidden_states_176.pow(2) + variance_44 = pow_45.mean(-1, keepdim=True) + pow_45 = None + add_132 = variance_44 + 1e-06 + variance_44 = None + rsqrt_44 = torch.rsqrt(add_132) + add_132 = None + hidden_states_177 = hidden_states_176 * rsqrt_44 + hidden_states_176 = rsqrt_44 = None + to_93 = hidden_states_177.to(torch.bfloat16) + hidden_states_177 = None + hidden_states_178 = ( + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + * to_93 + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + to_93 + ) = None + linear_154 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_67 = linear_154.view((1, 3, -1, 128)) + linear_154 = None + query_states_22 = view_67.transpose(1, 2) + view_67 = None + linear_155 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + 
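+ # NOTE: each to(torch.float32) / pow(2).mean(-1, keepdim=True) /
+ # rsqrt(variance + 1e-06) / to(torch.bfloat16) sequence above is the traced
+ # form of RMSNorm. A minimal sketch of the pattern (the function name
+ # `rms_norm` is hypothetical):
+ #
+ #     def rms_norm(x, weight, eps=1e-06):
+ #         h = x.to(torch.float32)
+ #         h = h * torch.rsqrt(h.pow(2).mean(-1, keepdim=True) + eps)
+ #         return weight * h.to(torch.bfloat16)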
view_68 = linear_155.view((1, 3, -1, 128)) + linear_155 = None + key_states_22 = view_68.transpose(1, 2) + view_68 = None + linear_156 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_178 = l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_69 = linear_156.view((1, 3, -1, 128)) + linear_156 = None + value_states_22 = view_69.transpose(1, 2) + view_69 = None + cos_25 = cos_2.unsqueeze(1) + sin_25 = sin_2.unsqueeze(1) + mul_203 = query_states_22 * cos_25 + x1_44 = query_states_22[(Ellipsis, slice(None, 64, None))] + x2_44 = query_states_22[(Ellipsis, slice(64, None, None))] + query_states_22 = None + neg_44 = -x2_44 + x2_44 = None + cat_45 = torch.cat((neg_44, x1_44), dim=-1) + neg_44 = x1_44 = None + mul_204 = cat_45 * sin_25 + cat_45 = None + q_embed_22 = mul_203 + mul_204 + mul_203 = mul_204 = None + mul_205 = key_states_22 * cos_25 + cos_25 = None + x1_45 = key_states_22[(Ellipsis, slice(None, 64, None))] + x2_45 = key_states_22[(Ellipsis, slice(64, None, None))] + key_states_22 = None + neg_45 = -x2_45 + x2_45 = None + cat_46 = torch.cat((neg_45, x1_45), dim=-1) + neg_45 = x1_45 = None + mul_206 = cat_46 * sin_25 + cat_46 = sin_25 = None + k_embed_22 = mul_205 + mul_206 + mul_205 = mul_206 = None + attention_mask_23 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_22 = q_embed_22.contiguous() + q_embed_22 = None + key_22 = k_embed_22.contiguous() + value_22 = value_states_22.contiguous() + attn_output_88 = torch._C._nn.scaled_dot_product_attention( + query_22, + key_22, + value_22, + attn_mask=attention_mask_23, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_22 = key_22 = value_22 = attention_mask_23 = None + transpose_92 = attn_output_88.transpose(1, 2) + attn_output_88 = None + attn_output_89 = transpose_92.contiguous() + transpose_92 = None + reshape_22 = attn_output_89.reshape(1, 3, -1) + attn_output_89 = None + attn_output_90 = reshape_22.contiguous() + reshape_22 = None + attn_output_91 = torch._C._nn.linear( + attn_output_90, + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_90 = l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_179 = hidden_states_175 + attn_output_91 + hidden_states_175 = attn_output_91 = None + hidden_states_180 = hidden_states_179.to(torch.float32) + pow_46 = hidden_states_180.pow(2) + variance_45 = pow_46.mean(-1, keepdim=True) + pow_46 = None + add_136 = variance_45 + 1e-06 + variance_45 = None + rsqrt_45 = torch.rsqrt(add_136) + add_136 = None + hidden_states_181 = hidden_states_180 * rsqrt_45 + hidden_states_180 = rsqrt_45 = None + to_95 = hidden_states_181.to(torch.bfloat16) + hidden_states_181 = None + hidden_states_182 = ( + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + * to_95 + ) + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = ( + to_95 + ) = None + linear_158 = torch._C._nn.linear( + hidden_states_182, + l_self_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_22 = torch.nn.functional.silu(linear_158, inplace=False) + 
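+ # NOTE: the gate_proj / silu / up_proj / down_proj sequence in each layer
+ # above implements a SwiGLU-style MLP. An equivalent compact form (argument
+ # names are hypothetical; F.linear(x, w) computes x @ w.T with no bias, as in
+ # the torch._C._nn.linear calls above):
+ #
+ #     import torch.nn.functional as F
+ #
+ #     def mlp(x, gate_w, up_w, down_w):
+ #         return F.linear(F.silu(F.linear(x, gate_w)) * F.linear(x, up_w), down_w)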
linear_158 = None + linear_159 = torch._C._nn.linear( + hidden_states_182, + l_self_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_182 = l_self_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_209 = silu_22 * linear_159 + silu_22 = linear_159 = None + down_proj_22 = torch._C._nn.linear( + mul_209, + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_209 = l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_183 = hidden_states_179 + down_proj_22 + hidden_states_179 = down_proj_22 = None + hidden_states_184 = hidden_states_183.to(torch.float32) + pow_47 = hidden_states_184.pow(2) + variance_46 = pow_47.mean(-1, keepdim=True) + pow_47 = None + add_138 = variance_46 + 1e-06 + variance_46 = None + rsqrt_46 = torch.rsqrt(add_138) + add_138 = None + hidden_states_185 = hidden_states_184 * rsqrt_46 + hidden_states_184 = rsqrt_46 = None + to_97 = hidden_states_185.to(torch.bfloat16) + hidden_states_185 = None + hidden_states_186 = ( + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + * to_97 + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + to_97 + ) = None + linear_161 = torch._C._nn.linear( + hidden_states_186, + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_70 = linear_161.view((1, 3, -1, 128)) + linear_161 = None + query_states_23 = view_70.transpose(1, 2) + view_70 = None + linear_162 = torch._C._nn.linear( + hidden_states_186, + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_71 = linear_162.view((1, 3, -1, 128)) + linear_162 = None + key_states_23 = view_71.transpose(1, 2) + view_71 = None + linear_163 = torch._C._nn.linear( + hidden_states_186, + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_186 = l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_72 = linear_163.view((1, 3, -1, 128)) + linear_163 = None + value_states_23 = view_72.transpose(1, 2) + view_72 = None + cos_26 = cos_2.unsqueeze(1) + cos_2 = None + sin_26 = sin_2.unsqueeze(1) + sin_2 = None + mul_212 = query_states_23 * cos_26 + x1_46 = query_states_23[(Ellipsis, slice(None, 64, None))] + x2_46 = query_states_23[(Ellipsis, slice(64, None, None))] + query_states_23 = None + neg_46 = -x2_46 + x2_46 = None + cat_47 = torch.cat((neg_46, x1_46), dim=-1) + neg_46 = x1_46 = None + mul_213 = cat_47 * sin_26 + cat_47 = None + q_embed_23 = mul_212 + mul_213 + mul_212 = mul_213 = None + mul_214 = key_states_23 * cos_26 + cos_26 = None + x1_47 = key_states_23[(Ellipsis, slice(None, 64, None))] + x2_47 = key_states_23[(Ellipsis, slice(64, None, None))] + key_states_23 = None + neg_47 = -x2_47 + x2_47 = None + cat_48 = torch.cat((neg_47, x1_47), dim=-1) + neg_47 = x1_47 = None + mul_215 = cat_48 * sin_26 + cat_48 = sin_26 = None + k_embed_23 = mul_214 + mul_215 + mul_214 = mul_215 = None + attention_mask_24 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + 
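+ # NOTE: the constant scale=0.08838834764831845 passed to every
+ # scaled_dot_product_attention call in this graph is 1/sqrt(head_dim) for
+ # head_dim = 128, and the causal_mask_2[..., :3] slice trims the precomputed
+ # mask to the traced sequence length of 3 (is_causal=False because the mask
+ # is supplied explicitly). A quick check:
+ #
+ #     >>> import math; math.isclose(1 / math.sqrt(128), 0.08838834764831845)
+ #     True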
causal_mask_2 = None + query_23 = q_embed_23.contiguous() + q_embed_23 = None + key_23 = k_embed_23.contiguous() + value_23 = value_states_23.contiguous() + attn_output_92 = torch._C._nn.scaled_dot_product_attention( + query_23, + key_23, + value_23, + attn_mask=attention_mask_24, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_23 = key_23 = value_23 = attention_mask_24 = None + transpose_96 = attn_output_92.transpose(1, 2) + attn_output_92 = None + attn_output_93 = transpose_96.contiguous() + transpose_96 = None + reshape_23 = attn_output_93.reshape(1, 3, -1) + attn_output_93 = None + attn_output_94 = reshape_23.contiguous() + reshape_23 = None + attn_output_95 = torch._C._nn.linear( + attn_output_94, + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_94 = l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_187 = hidden_states_183 + attn_output_95 + hidden_states_183 = attn_output_95 = None + hidden_states_188 = hidden_states_187.to(torch.float32) + pow_48 = hidden_states_188.pow(2) + variance_47 = pow_48.mean(-1, keepdim=True) + pow_48 = None + add_142 = variance_47 + 1e-06 + variance_47 = None + rsqrt_47 = torch.rsqrt(add_142) + add_142 = None + hidden_states_189 = hidden_states_188 * rsqrt_47 + hidden_states_188 = rsqrt_47 = None + to_99 = hidden_states_189.to(torch.bfloat16) + hidden_states_189 = None + hidden_states_190 = ( + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + * to_99 + ) + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = ( + to_99 + ) = None + linear_165 = torch._C._nn.linear( + hidden_states_190, + l_self_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_23 = torch.nn.functional.silu(linear_165, inplace=False) + linear_165 = None + linear_166 = torch._C._nn.linear( + hidden_states_190, + l_self_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_190 = l_self_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_218 = silu_23 * linear_166 + silu_23 = linear_166 = None + down_proj_23 = torch._C._nn.linear( + mul_218, + l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_218 = l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_191 = hidden_states_187 + down_proj_23 + hidden_states_187 = down_proj_23 = None + hidden_states_192 = hidden_states_191.to(torch.float32) + hidden_states_191 = None + pow_49 = hidden_states_192.pow(2) + variance_48 = pow_49.mean(-1, keepdim=True) + pow_49 = None + add_144 = variance_48 + 1e-06 + variance_48 = None + rsqrt_48 = torch.rsqrt(add_144) + add_144 = None + hidden_states_193 = hidden_states_192 * rsqrt_48 + hidden_states_192 = rsqrt_48 = None + to_101 = hidden_states_193.to(torch.bfloat16) + hidden_states_193 = None + hidden_states_194 = l_self_modules_norm_parameters_weight_ * to_101 + l_self_modules_norm_parameters_weight_ = to_101 = None + return ( + value_states, + k_embed, + value_states_1, + k_embed_1, + value_states_2, + k_embed_2, + value_states_3, + k_embed_3, + value_states_4, + k_embed_4, + value_states_5, + k_embed_5, + value_states_6, + k_embed_6, + value_states_7, + k_embed_7, 
+ value_states_8, + k_embed_8, + value_states_9, + k_embed_9, + value_states_10, + k_embed_10, + value_states_11, + k_embed_11, + value_states_12, + k_embed_12, + value_states_13, + k_embed_13, + value_states_14, + k_embed_14, + value_states_15, + k_embed_15, + value_states_16, + k_embed_16, + value_states_17, + k_embed_17, + value_states_18, + k_embed_18, + value_states_19, + k_embed_19, + value_states_20, + k_embed_20, + value_states_21, + k_embed_21, + value_states_22, + k_embed_22, + value_states_23, + k_embed_23, + hidden_states_194, + ) diff --git a/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/weight_meta.py b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/weight_meta.py new file mode 100644 index 000000000..c34788efb --- /dev/null +++ b/samples/transformers-auto-model/deepseek-ai/deepseek-coder-1.3b-base/weight_meta.py @@ -0,0 +1,2218 @@ +class Program_weight_tensor_meta_L_inputs_embeds_: + name = "L_inputs_embeds_" + shape = [1, 3, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 3] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1, 1] + + +class Program_weight_tensor_meta_L_self_modules_rotary_emb_buffers_inv_freq_: + name = "L_self_modules_rotary_emb_buffers_inv_freq_" + shape = [64] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.024 + std = 0.052 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 
-0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = 
"cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 
2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 
+ std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" 
+ mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: + name = ( + "L_self_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_" + ) + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 
2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [5504, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 5504]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_norm_parameters_weight_:
+    name = "L_self_modules_norm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
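[Editor's note] Each weight_meta.py record stores only distribution statistics (shape, dtype, device, mean, std) with data = None, so concrete tensors have to be re-synthesized when a sample is replayed. A minimal sketch of how such a record could be materialized; the materialize() helper and the use of torch.normal are assumptions for illustration, not code shipped in this patch:

    import torch

    class Program_weight_tensor_meta_example:
        # Same schema as the records in weight_meta.py (hypothetical instance).
        name = "L_self_modules_norm_parameters_weight_"
        shape = [2048]
        dtype = "torch.bfloat16"
        device = "cuda:0"
        mean = 1.000
        std = 0.000
        data = None

    def materialize(meta) -> torch.Tensor:
        """Assumed helper: draw a random tensor matching the recorded stats."""
        # "torch.bfloat16" -> torch.bfloat16
        dtype = getattr(torch, meta.dtype.split(".")[-1])
        t = torch.normal(meta.mean, meta.std, size=tuple(meta.shape))
        return t.to(dtype=dtype, device=meta.device)

    weight = materialize(Program_weight_tensor_meta_example)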
"cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5504, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5504] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_norm_parameters_weight_: + name = "L_self_modules_norm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/graph_hash.txt b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/graph_hash.txt new file mode 100644 index 000000000..7f1d2c7d1 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/graph_hash.txt @@ -0,0 +1 @@ +a35cb111813677b2a3cab5e9c735cf5aeba61421fb90ca0b3b4c15f4d0d1700a \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/graph_net.json b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/input_meta.py b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/input_tensor_constraints.py b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/model.py b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/model.py new file mode 100644 index 000000000..cb23865a9 --- /dev/null +++ 
diff --git a/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/model.py b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/model.py
new file mode 100644
index 000000000..cb23865a9
--- /dev/null
+++ b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/model.py
@@ -0,0 +1,5817 @@
+import torch
+
+from torch import device
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_inputs_embeds_: torch.Tensor,
+        L_attention_mask_: torch.Tensor,
+        L_self_modules_rotary_emb_buffers_inv_freq_: torch.Tensor,
+        L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_norm_parameters_weight_: torch.nn.parameter.Parameter,
+    ):
+        l_inputs_embeds_ = L_inputs_embeds_
+        l_attention_mask_ = L_attention_mask_
+        l_self_modules_rotary_emb_buffers_inv_freq_ = (
+            L_self_modules_rotary_emb_buffers_inv_freq_
+        )
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = 
L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + 
l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + 
L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ + 
l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = 
L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_norm_parameters_weight_ = L_self_modules_norm_parameters_weight_ + cache_position = torch.arange(0, 2, device=device(type="cuda", index=0)) + position_ids = cache_position.unsqueeze(0) + attention_mask = l_attention_mask_.to( + device=device(type="cuda", index=0), dtype=torch.bool + ) + l_attention_mask_ = None + mask_indices = torch.arange(2, 
device=device(type="cuda", index=0)) + mask_indices += 0 + mask_indices_1 = mask_indices + mask_indices = None + local_padding_mask = attention_mask[(slice(None, None, None), mask_indices_1)] + attention_mask = mask_indices_1 = None + kv_arange = torch.arange(2, device=device(type="cuda", index=0)) + kv_arange += 0 + kv_arange_1 = kv_arange + kv_arange = None + reshaped_cache_position = cache_position.view(-1, 1) + cache_position = None + causal_mask = kv_arange_1 <= reshaped_cache_position + sub = reshaped_cache_position - 2047 + reshaped_cache_position = None + sliding_mask_overlay = kv_arange_1 > sub + kv_arange_1 = sub = None + causal_mask *= sliding_mask_overlay + causal_mask_1 = causal_mask + causal_mask = sliding_mask_overlay = None + getitem_1 = causal_mask_1[ + (None, None, slice(None, None, None), slice(None, None, None)) + ] + causal_mask_1 = None + causal_mask_2 = getitem_1.expand(1, -1, -1, -1) + getitem_1 = None + getitem_2 = local_padding_mask[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + local_padding_mask = None + causal_mask_3 = causal_mask_2 * getitem_2 + causal_mask_2 = getitem_2 = None + _set_grad_enabled = torch._C._set_grad_enabled(False) + _set_grad_enabled = None + getitem_3 = l_self_modules_rotary_emb_buffers_inv_freq_[ + (None, slice(None, None, None), None) + ] + l_self_modules_rotary_emb_buffers_inv_freq_ = None + float_1 = getitem_3.float() + getitem_3 = None + expand_1 = float_1.expand(1, -1, 1) + float_1 = None + inv_freq_expanded = expand_1.to(device(type="cuda", index=0)) + expand_1 = None + getitem_4 = position_ids[ + (slice(None, None, None), None, slice(None, None, None)) + ] + position_ids = None + position_ids_expanded = getitem_4.float() + getitem_4 = None + float_3 = inv_freq_expanded.float() + inv_freq_expanded = None + float_4 = position_ids_expanded.float() + position_ids_expanded = None + matmul = float_3 @ float_4 + float_3 = float_4 = None + freqs = matmul.transpose(1, 2) + matmul = None + emb = torch.cat((freqs, freqs), dim=-1) + freqs = None + cos = emb.cos() + cos_1 = cos * 1.0 + cos = None + sin = emb.sin() + emb = None + sin_1 = sin * 1.0 + sin = None + cos_2 = cos_1.to(dtype=torch.bfloat16) + cos_1 = None + sin_2 = sin_1.to(dtype=torch.bfloat16) + sin_1 = None + _set_grad_enabled_1 = torch._C._set_grad_enabled(True) + _set_grad_enabled_1 = None + _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module") + _log_api_usage_once = None + hidden_states = l_inputs_embeds_.to(torch.float32) + pow_1 = hidden_states.pow(2) + variance = pow_1.mean(-1, keepdim=True) + pow_1 = None + add = variance + 1e-05 + variance = None + rsqrt = torch.rsqrt(add) + add = None + hidden_states_1 = hidden_states * rsqrt + hidden_states = rsqrt = None + to_5 = hidden_states_1.to(torch.bfloat16) + hidden_states_1 = None + hidden_states_2 = ( + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + * to_5 + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + to_5 + ) = None + qkv = torch._C._nn.linear( + hidden_states_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states = qkv[(Ellipsis, slice(None, 3072, None))] + key_states = qkv[(Ellipsis, slice(3072, 6144, None))] + value_states = qkv[(Ellipsis, slice(6144, None, None))] + qkv = None + view_1 = query_states.view((1, 2, -1, 96)) + query_states = 
None + query_states_1 = view_1.transpose(1, 2) + view_1 = None + view_2 = key_states.view((1, 2, -1, 96)) + key_states = None + key_states_1 = view_2.transpose(1, 2) + view_2 = None + view_3 = value_states.view((1, 2, -1, 96)) + value_states = None + value_states_1 = view_3.transpose(1, 2) + view_3 = None + cos_3 = cos_2.unsqueeze(1) + sin_3 = sin_2.unsqueeze(1) + q_rot = query_states_1[(Ellipsis, slice(None, 96, None))] + q_pass = query_states_1[(Ellipsis, slice(96, None, None))] + query_states_1 = None + k_rot = key_states_1[(Ellipsis, slice(None, 96, None))] + k_pass = key_states_1[(Ellipsis, slice(96, None, None))] + key_states_1 = None + mul_5 = q_rot * cos_3 + x1 = q_rot[(Ellipsis, slice(None, 48, None))] + x2 = q_rot[(Ellipsis, slice(48, None, None))] + q_rot = None + neg = -x2 + x2 = None + cat_1 = torch.cat((neg, x1), dim=-1) + neg = x1 = None + mul_6 = cat_1 * sin_3 + cat_1 = None + add_1 = mul_5 + mul_6 + mul_5 = mul_6 = None + q_embed = torch.cat([add_1, q_pass], dim=-1) + add_1 = q_pass = None + mul_7 = k_rot * cos_3 + cos_3 = None + x1_1 = k_rot[(Ellipsis, slice(None, 48, None))] + x2_1 = k_rot[(Ellipsis, slice(48, None, None))] + k_rot = None + neg_1 = -x2_1 + x2_1 = None + cat_3 = torch.cat((neg_1, x1_1), dim=-1) + neg_1 = x1_1 = None + mul_8 = cat_3 * sin_3 + cat_3 = sin_3 = None + add_2 = mul_7 + mul_8 + mul_7 = mul_8 = None + k_embed = torch.cat([add_2, k_pass], dim=-1) + add_2 = k_pass = None + attention_mask_1 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query = q_embed.contiguous() + q_embed = None + key = k_embed.contiguous() + value = value_states_1.contiguous() + attn_output = torch._C._nn.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask_1, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query = key = value = attention_mask_1 = None + transpose_4 = attn_output.transpose(1, 2) + attn_output = None + attn_output_1 = transpose_4.contiguous() + transpose_4 = None + reshape = attn_output_1.reshape(1, 2, -1) + attn_output_1 = None + attn_output_2 = reshape.contiguous() + reshape = None + attn_output_3 = torch._C._nn.linear( + attn_output_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout = torch.nn.functional.dropout(attn_output_3, 0.0, False, False) + attn_output_3 = None + hidden_states_3 = l_inputs_embeds_ + dropout + l_inputs_embeds_ = dropout = None + hidden_states_4 = hidden_states_3.to(torch.float32) + pow_2 = hidden_states_4.pow(2) + variance_1 = pow_2.mean(-1, keepdim=True) + pow_2 = None + add_4 = variance_1 + 1e-05 + variance_1 = None + rsqrt_1 = torch.rsqrt(add_4) + add_4 = None + hidden_states_5 = hidden_states_4 * rsqrt_1 + hidden_states_4 = rsqrt_1 = None + to_7 = hidden_states_5.to(torch.bfloat16) + hidden_states_5 = None + hidden_states_6 = ( + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + * to_7 + ) + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = ( + to_7 + ) = None + up_states = torch._C._nn.linear( + hidden_states_6, + l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_6 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk = up_states.chunk(2, 
dim=-1) + up_states = None + gate = chunk[0] + up_states_1 = chunk[1] + chunk = None + silu = torch.nn.functional.silu(gate, inplace=False) + gate = None + up_states_2 = up_states_1 * silu + up_states_1 = silu = None + hidden_states_7 = torch._C._nn.linear( + up_states_2, + l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_2 = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_1 = torch.nn.functional.dropout(hidden_states_7, 0.0, False, False) + hidden_states_7 = None + hidden_states_8 = hidden_states_3 + dropout_1 + hidden_states_3 = dropout_1 = None + hidden_states_9 = hidden_states_8.to(torch.float32) + pow_3 = hidden_states_9.pow(2) + variance_2 = pow_3.mean(-1, keepdim=True) + pow_3 = None + add_6 = variance_2 + 1e-05 + variance_2 = None + rsqrt_2 = torch.rsqrt(add_6) + add_6 = None + hidden_states_10 = hidden_states_9 * rsqrt_2 + hidden_states_9 = rsqrt_2 = None + to_9 = hidden_states_10.to(torch.bfloat16) + hidden_states_10 = None + hidden_states_11 = ( + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + * to_9 + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + to_9 + ) = None + qkv_1 = torch._C._nn.linear( + hidden_states_11, + l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_11 = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_2 = qkv_1[(Ellipsis, slice(None, 3072, None))] + key_states_2 = qkv_1[(Ellipsis, slice(3072, 6144, None))] + value_states_2 = qkv_1[(Ellipsis, slice(6144, None, None))] + qkv_1 = None + view_4 = query_states_2.view((1, 2, -1, 96)) + query_states_2 = None + query_states_3 = view_4.transpose(1, 2) + view_4 = None + view_5 = key_states_2.view((1, 2, -1, 96)) + key_states_2 = None + key_states_3 = view_5.transpose(1, 2) + view_5 = None + view_6 = value_states_2.view((1, 2, -1, 96)) + value_states_2 = None + value_states_3 = view_6.transpose(1, 2) + view_6 = None + cos_4 = cos_2.unsqueeze(1) + sin_4 = sin_2.unsqueeze(1) + q_rot_1 = query_states_3[(Ellipsis, slice(None, 96, None))] + q_pass_1 = query_states_3[(Ellipsis, slice(96, None, None))] + query_states_3 = None + k_rot_1 = key_states_3[(Ellipsis, slice(None, 96, None))] + k_pass_1 = key_states_3[(Ellipsis, slice(96, None, None))] + key_states_3 = None + mul_14 = q_rot_1 * cos_4 + x1_2 = q_rot_1[(Ellipsis, slice(None, 48, None))] + x2_2 = q_rot_1[(Ellipsis, slice(48, None, None))] + q_rot_1 = None + neg_2 = -x2_2 + x2_2 = None + cat_5 = torch.cat((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + mul_15 = cat_5 * sin_4 + cat_5 = None + add_7 = mul_14 + mul_15 + mul_14 = mul_15 = None + q_embed_1 = torch.cat([add_7, q_pass_1], dim=-1) + add_7 = q_pass_1 = None + mul_16 = k_rot_1 * cos_4 + cos_4 = None + x1_3 = k_rot_1[(Ellipsis, slice(None, 48, None))] + x2_3 = k_rot_1[(Ellipsis, slice(48, None, None))] + k_rot_1 = None + neg_3 = -x2_3 + x2_3 = None + cat_7 = torch.cat((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + mul_17 = cat_7 * sin_4 + cat_7 = sin_4 = None + add_8 = mul_16 + mul_17 + mul_16 = mul_17 = None + k_embed_1 = torch.cat([add_8, k_pass_1], dim=-1) + add_8 = k_pass_1 = None + attention_mask_2 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_1 = q_embed_1.contiguous() + q_embed_1 = None + key_1 = k_embed_1.contiguous() + 
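
The attention_mask_* tensors consumed by each layer here are slices of one mask built up front: a boolean causal test (kv position <= query position) is AND-ed with a sliding-window overlay (kv position > query position - 2047, matching Phi-3-mini-4k's window of 2047) and with the padding mask, then broadcast to (batch, 1, q_len, kv_len). A standalone sketch of that construction with illustrative names, mirroring the kv_arange/reshaped_cache_position arithmetic traced above:

import torch

def sliding_window_causal_mask(padding_mask, cache_position, kv_len, window=2047):
    # padding_mask: (batch, kv_len) bool; cache_position: (q_len,) absolute positions
    kv_arange = torch.arange(kv_len, device=cache_position.device)
    q_pos = cache_position.view(-1, 1)
    causal = kv_arange <= q_pos             # attend only to past/self...
    within = kv_arange > q_pos - window     # ...and at most `window` tokens back
    mask = (causal & within)[None, None]    # -> (1, 1, q_len, kv_len)
    return mask * padding_mask[:, None, None, :]

For the two-token prefill traced in this graph, kv_len and q_len are both 2, which is why every per-layer slice takes [..., :2].
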
value_1 = value_states_3.contiguous() + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_1, + value_1, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_1 = key_1 = value_1 = attention_mask_2 = None + transpose_8 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_8.contiguous() + transpose_8 = None + reshape_1 = attn_output_5.reshape(1, 2, -1) + attn_output_5 = None + attn_output_6 = reshape_1.contiguous() + reshape_1 = None + attn_output_7 = torch._C._nn.linear( + attn_output_6, + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_6 = l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_2 = torch.nn.functional.dropout(attn_output_7, 0.0, False, False) + attn_output_7 = None + hidden_states_12 = hidden_states_8 + dropout_2 + hidden_states_8 = dropout_2 = None + hidden_states_13 = hidden_states_12.to(torch.float32) + pow_4 = hidden_states_13.pow(2) + variance_3 = pow_4.mean(-1, keepdim=True) + pow_4 = None + add_10 = variance_3 + 1e-05 + variance_3 = None + rsqrt_3 = torch.rsqrt(add_10) + add_10 = None + hidden_states_14 = hidden_states_13 * rsqrt_3 + hidden_states_13 = rsqrt_3 = None + to_11 = hidden_states_14.to(torch.bfloat16) + hidden_states_14 = None + hidden_states_15 = ( + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + * to_11 + ) + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = ( + to_11 + ) = None + up_states_3 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_15 = l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_1 = up_states_3.chunk(2, dim=-1) + up_states_3 = None + gate_1 = chunk_1[0] + up_states_4 = chunk_1[1] + chunk_1 = None + silu_1 = torch.nn.functional.silu(gate_1, inplace=False) + gate_1 = None + up_states_5 = up_states_4 * silu_1 + up_states_4 = silu_1 = None + hidden_states_16 = torch._C._nn.linear( + up_states_5, + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_5 = l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_3 = torch.nn.functional.dropout(hidden_states_16, 0.0, False, False) + hidden_states_16 = None + hidden_states_17 = hidden_states_12 + dropout_3 + hidden_states_12 = dropout_3 = None + hidden_states_18 = hidden_states_17.to(torch.float32) + pow_5 = hidden_states_18.pow(2) + variance_4 = pow_5.mean(-1, keepdim=True) + pow_5 = None + add_12 = variance_4 + 1e-05 + variance_4 = None + rsqrt_4 = torch.rsqrt(add_12) + add_12 = None + hidden_states_19 = hidden_states_18 * rsqrt_4 + hidden_states_18 = rsqrt_4 = None + to_13 = hidden_states_19.to(torch.bfloat16) + hidden_states_19 = None + hidden_states_20 = ( + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + * to_13 + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + to_13 + ) = None + qkv_2 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_20 = l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_4 = qkv_2[(Ellipsis, 
slice(None, 3072, None))] + key_states_4 = qkv_2[(Ellipsis, slice(3072, 6144, None))] + value_states_4 = qkv_2[(Ellipsis, slice(6144, None, None))] + qkv_2 = None + view_7 = query_states_4.view((1, 2, -1, 96)) + query_states_4 = None + query_states_5 = view_7.transpose(1, 2) + view_7 = None + view_8 = key_states_4.view((1, 2, -1, 96)) + key_states_4 = None + key_states_5 = view_8.transpose(1, 2) + view_8 = None + view_9 = value_states_4.view((1, 2, -1, 96)) + value_states_4 = None + value_states_5 = view_9.transpose(1, 2) + view_9 = None + cos_5 = cos_2.unsqueeze(1) + sin_5 = sin_2.unsqueeze(1) + q_rot_2 = query_states_5[(Ellipsis, slice(None, 96, None))] + q_pass_2 = query_states_5[(Ellipsis, slice(96, None, None))] + query_states_5 = None + k_rot_2 = key_states_5[(Ellipsis, slice(None, 96, None))] + k_pass_2 = key_states_5[(Ellipsis, slice(96, None, None))] + key_states_5 = None + mul_23 = q_rot_2 * cos_5 + x1_4 = q_rot_2[(Ellipsis, slice(None, 48, None))] + x2_4 = q_rot_2[(Ellipsis, slice(48, None, None))] + q_rot_2 = None + neg_4 = -x2_4 + x2_4 = None + cat_9 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_24 = cat_9 * sin_5 + cat_9 = None + add_13 = mul_23 + mul_24 + mul_23 = mul_24 = None + q_embed_2 = torch.cat([add_13, q_pass_2], dim=-1) + add_13 = q_pass_2 = None + mul_25 = k_rot_2 * cos_5 + cos_5 = None + x1_5 = k_rot_2[(Ellipsis, slice(None, 48, None))] + x2_5 = k_rot_2[(Ellipsis, slice(48, None, None))] + k_rot_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_11 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_26 = cat_11 * sin_5 + cat_11 = sin_5 = None + add_14 = mul_25 + mul_26 + mul_25 = mul_26 = None + k_embed_2 = torch.cat([add_14, k_pass_2], dim=-1) + add_14 = k_pass_2 = None + attention_mask_3 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_2 = q_embed_2.contiguous() + q_embed_2 = None + key_2 = k_embed_2.contiguous() + value_2 = value_states_5.contiguous() + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_2, + value_2, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_2 = key_2 = value_2 = attention_mask_3 = None + transpose_12 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_12.contiguous() + transpose_12 = None + reshape_2 = attn_output_9.reshape(1, 2, -1) + attn_output_9 = None + attn_output_10 = reshape_2.contiguous() + reshape_2 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_10 = l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_4 = torch.nn.functional.dropout(attn_output_11, 0.0, False, False) + attn_output_11 = None + hidden_states_21 = hidden_states_17 + dropout_4 + hidden_states_17 = dropout_4 = None + hidden_states_22 = hidden_states_21.to(torch.float32) + pow_6 = hidden_states_22.pow(2) + variance_5 = pow_6.mean(-1, keepdim=True) + pow_6 = None + add_16 = variance_5 + 1e-05 + variance_5 = None + rsqrt_5 = torch.rsqrt(add_16) + add_16 = None + hidden_states_23 = hidden_states_22 * rsqrt_5 + hidden_states_22 = rsqrt_5 = None + to_15 = hidden_states_23.to(torch.bfloat16) + hidden_states_23 = None + hidden_states_24 = ( + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + * to_15 + ) + 
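
Two patterns recur verbatim in every layer around this point. The pow/mean/rsqrt sequence is an inlined RMSNorm (float32 accumulation, eps = 1e-05, bfloat16 output), and the gate_up_proj/chunk/silu/down_proj sequence that follows it is Phi-3's gated MLP: one fused projection chunked in two, SiLU applied to the gate half, elementwise product, then the down projection. Both in compact form (a sketch, not the patch's code):

import torch
import torch.nn.functional as F

def rms_norm(x, weight, eps=1e-5):
    # Compute in float32 for stability, return bfloat16 like the traced graph.
    h = x.to(torch.float32)
    h = h * torch.rsqrt(h.pow(2).mean(-1, keepdim=True) + eps)
    return weight * h.to(torch.bfloat16)

def gated_mlp(hidden, gate_up_weight, down_weight):
    gate, up = F.linear(hidden, gate_up_weight).chunk(2, dim=-1)
    return F.linear(up * F.silu(gate), down_weight)
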
l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = ( + to_15 + ) = None + up_states_6 = torch._C._nn.linear( + hidden_states_24, + l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_24 = l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_2 = up_states_6.chunk(2, dim=-1) + up_states_6 = None + gate_2 = chunk_2[0] + up_states_7 = chunk_2[1] + chunk_2 = None + silu_2 = torch.nn.functional.silu(gate_2, inplace=False) + gate_2 = None + up_states_8 = up_states_7 * silu_2 + up_states_7 = silu_2 = None + hidden_states_25 = torch._C._nn.linear( + up_states_8, + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_8 = l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_5 = torch.nn.functional.dropout(hidden_states_25, 0.0, False, False) + hidden_states_25 = None + hidden_states_26 = hidden_states_21 + dropout_5 + hidden_states_21 = dropout_5 = None + hidden_states_27 = hidden_states_26.to(torch.float32) + pow_7 = hidden_states_27.pow(2) + variance_6 = pow_7.mean(-1, keepdim=True) + pow_7 = None + add_18 = variance_6 + 1e-05 + variance_6 = None + rsqrt_6 = torch.rsqrt(add_18) + add_18 = None + hidden_states_28 = hidden_states_27 * rsqrt_6 + hidden_states_27 = rsqrt_6 = None + to_17 = hidden_states_28.to(torch.bfloat16) + hidden_states_28 = None + hidden_states_29 = ( + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + * to_17 + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + to_17 + ) = None + qkv_3 = torch._C._nn.linear( + hidden_states_29, + l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_29 = l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_6 = qkv_3[(Ellipsis, slice(None, 3072, None))] + key_states_6 = qkv_3[(Ellipsis, slice(3072, 6144, None))] + value_states_6 = qkv_3[(Ellipsis, slice(6144, None, None))] + qkv_3 = None + view_10 = query_states_6.view((1, 2, -1, 96)) + query_states_6 = None + query_states_7 = view_10.transpose(1, 2) + view_10 = None + view_11 = key_states_6.view((1, 2, -1, 96)) + key_states_6 = None + key_states_7 = view_11.transpose(1, 2) + view_11 = None + view_12 = value_states_6.view((1, 2, -1, 96)) + value_states_6 = None + value_states_7 = view_12.transpose(1, 2) + view_12 = None + cos_6 = cos_2.unsqueeze(1) + sin_6 = sin_2.unsqueeze(1) + q_rot_3 = query_states_7[(Ellipsis, slice(None, 96, None))] + q_pass_3 = query_states_7[(Ellipsis, slice(96, None, None))] + query_states_7 = None + k_rot_3 = key_states_7[(Ellipsis, slice(None, 96, None))] + k_pass_3 = key_states_7[(Ellipsis, slice(96, None, None))] + key_states_7 = None + mul_32 = q_rot_3 * cos_6 + x1_6 = q_rot_3[(Ellipsis, slice(None, 48, None))] + x2_6 = q_rot_3[(Ellipsis, slice(48, None, None))] + q_rot_3 = None + neg_6 = -x2_6 + x2_6 = None + cat_13 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_33 = cat_13 * sin_6 + cat_13 = None + add_19 = mul_32 + mul_33 + mul_32 = mul_33 = None + q_embed_3 = torch.cat([add_19, q_pass_3], dim=-1) + add_19 = q_pass_3 = None + mul_34 = k_rot_3 * cos_6 + cos_6 = None + x1_7 = k_rot_3[(Ellipsis, slice(None, 48, None))] + x2_7 = k_rot_3[(Ellipsis, slice(48, None, None))] + k_rot_3 = None + neg_7 = -x2_7 + x2_7 = None + cat_15 = 
torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_35 = cat_15 * sin_6 + cat_15 = sin_6 = None + add_20 = mul_34 + mul_35 + mul_34 = mul_35 = None + k_embed_3 = torch.cat([add_20, k_pass_3], dim=-1) + add_20 = k_pass_3 = None + attention_mask_4 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_3 = q_embed_3.contiguous() + q_embed_3 = None + key_3 = k_embed_3.contiguous() + value_3 = value_states_7.contiguous() + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_3, + key_3, + value_3, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_3 = key_3 = value_3 = attention_mask_4 = None + transpose_16 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_16.contiguous() + transpose_16 = None + reshape_3 = attn_output_13.reshape(1, 2, -1) + attn_output_13 = None + attn_output_14 = reshape_3.contiguous() + reshape_3 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_14 = l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_6 = torch.nn.functional.dropout(attn_output_15, 0.0, False, False) + attn_output_15 = None + hidden_states_30 = hidden_states_26 + dropout_6 + hidden_states_26 = dropout_6 = None + hidden_states_31 = hidden_states_30.to(torch.float32) + pow_8 = hidden_states_31.pow(2) + variance_7 = pow_8.mean(-1, keepdim=True) + pow_8 = None + add_22 = variance_7 + 1e-05 + variance_7 = None + rsqrt_7 = torch.rsqrt(add_22) + add_22 = None + hidden_states_32 = hidden_states_31 * rsqrt_7 + hidden_states_31 = rsqrt_7 = None + to_19 = hidden_states_32.to(torch.bfloat16) + hidden_states_32 = None + hidden_states_33 = ( + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + * to_19 + ) + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = ( + to_19 + ) = None + up_states_9 = torch._C._nn.linear( + hidden_states_33, + l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_33 = l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_3 = up_states_9.chunk(2, dim=-1) + up_states_9 = None + gate_3 = chunk_3[0] + up_states_10 = chunk_3[1] + chunk_3 = None + silu_3 = torch.nn.functional.silu(gate_3, inplace=False) + gate_3 = None + up_states_11 = up_states_10 * silu_3 + up_states_10 = silu_3 = None + hidden_states_34 = torch._C._nn.linear( + up_states_11, + l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_11 = l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_7 = torch.nn.functional.dropout(hidden_states_34, 0.0, False, False) + hidden_states_34 = None + hidden_states_35 = hidden_states_30 + dropout_7 + hidden_states_30 = dropout_7 = None + hidden_states_36 = hidden_states_35.to(torch.float32) + pow_9 = hidden_states_36.pow(2) + variance_8 = pow_9.mean(-1, keepdim=True) + pow_9 = None + add_24 = variance_8 + 1e-05 + variance_8 = None + rsqrt_8 = torch.rsqrt(add_24) + add_24 = None + hidden_states_37 = hidden_states_36 * rsqrt_8 + hidden_states_36 = rsqrt_8 = None + to_21 = hidden_states_37.to(torch.bfloat16) + hidden_states_37 = None + hidden_states_38 = ( + 
l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + * to_21 + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + to_21 + ) = None + qkv_4 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_38 = l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_8 = qkv_4[(Ellipsis, slice(None, 3072, None))] + key_states_8 = qkv_4[(Ellipsis, slice(3072, 6144, None))] + value_states_8 = qkv_4[(Ellipsis, slice(6144, None, None))] + qkv_4 = None + view_13 = query_states_8.view((1, 2, -1, 96)) + query_states_8 = None + query_states_9 = view_13.transpose(1, 2) + view_13 = None + view_14 = key_states_8.view((1, 2, -1, 96)) + key_states_8 = None + key_states_9 = view_14.transpose(1, 2) + view_14 = None + view_15 = value_states_8.view((1, 2, -1, 96)) + value_states_8 = None + value_states_9 = view_15.transpose(1, 2) + view_15 = None + cos_7 = cos_2.unsqueeze(1) + sin_7 = sin_2.unsqueeze(1) + q_rot_4 = query_states_9[(Ellipsis, slice(None, 96, None))] + q_pass_4 = query_states_9[(Ellipsis, slice(96, None, None))] + query_states_9 = None + k_rot_4 = key_states_9[(Ellipsis, slice(None, 96, None))] + k_pass_4 = key_states_9[(Ellipsis, slice(96, None, None))] + key_states_9 = None + mul_41 = q_rot_4 * cos_7 + x1_8 = q_rot_4[(Ellipsis, slice(None, 48, None))] + x2_8 = q_rot_4[(Ellipsis, slice(48, None, None))] + q_rot_4 = None + neg_8 = -x2_8 + x2_8 = None + cat_17 = torch.cat((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + mul_42 = cat_17 * sin_7 + cat_17 = None + add_25 = mul_41 + mul_42 + mul_41 = mul_42 = None + q_embed_4 = torch.cat([add_25, q_pass_4], dim=-1) + add_25 = q_pass_4 = None + mul_43 = k_rot_4 * cos_7 + cos_7 = None + x1_9 = k_rot_4[(Ellipsis, slice(None, 48, None))] + x2_9 = k_rot_4[(Ellipsis, slice(48, None, None))] + k_rot_4 = None + neg_9 = -x2_9 + x2_9 = None + cat_19 = torch.cat((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + mul_44 = cat_19 * sin_7 + cat_19 = sin_7 = None + add_26 = mul_43 + mul_44 + mul_43 = mul_44 = None + k_embed_4 = torch.cat([add_26, k_pass_4], dim=-1) + add_26 = k_pass_4 = None + attention_mask_5 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_4 = q_embed_4.contiguous() + q_embed_4 = None + key_4 = k_embed_4.contiguous() + value_4 = value_states_9.contiguous() + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_4, + value_4, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_4 = key_4 = value_4 = attention_mask_5 = None + transpose_20 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_20.contiguous() + transpose_20 = None + reshape_4 = attn_output_17.reshape(1, 2, -1) + attn_output_17 = None + attn_output_18 = reshape_4.contiguous() + reshape_4 = None + attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_18 = l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_8 = torch.nn.functional.dropout(attn_output_19, 0.0, False, False) + attn_output_19 = None + hidden_states_39 = hidden_states_35 + dropout_8 + hidden_states_35 = dropout_8 = None + hidden_states_40 = 
hidden_states_39.to(torch.float32) + pow_10 = hidden_states_40.pow(2) + variance_9 = pow_10.mean(-1, keepdim=True) + pow_10 = None + add_28 = variance_9 + 1e-05 + variance_9 = None + rsqrt_9 = torch.rsqrt(add_28) + add_28 = None + hidden_states_41 = hidden_states_40 * rsqrt_9 + hidden_states_40 = rsqrt_9 = None + to_23 = hidden_states_41.to(torch.bfloat16) + hidden_states_41 = None + hidden_states_42 = ( + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + * to_23 + ) + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = ( + to_23 + ) = None + up_states_12 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_42 = l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_4 = up_states_12.chunk(2, dim=-1) + up_states_12 = None + gate_4 = chunk_4[0] + up_states_13 = chunk_4[1] + chunk_4 = None + silu_4 = torch.nn.functional.silu(gate_4, inplace=False) + gate_4 = None + up_states_14 = up_states_13 * silu_4 + up_states_13 = silu_4 = None + hidden_states_43 = torch._C._nn.linear( + up_states_14, + l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_14 = l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_9 = torch.nn.functional.dropout(hidden_states_43, 0.0, False, False) + hidden_states_43 = None + hidden_states_44 = hidden_states_39 + dropout_9 + hidden_states_39 = dropout_9 = None + hidden_states_45 = hidden_states_44.to(torch.float32) + pow_11 = hidden_states_45.pow(2) + variance_10 = pow_11.mean(-1, keepdim=True) + pow_11 = None + add_30 = variance_10 + 1e-05 + variance_10 = None + rsqrt_10 = torch.rsqrt(add_30) + add_30 = None + hidden_states_46 = hidden_states_45 * rsqrt_10 + hidden_states_45 = rsqrt_10 = None + to_25 = hidden_states_46.to(torch.bfloat16) + hidden_states_46 = None + hidden_states_47 = ( + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + * to_25 + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + to_25 + ) = None + qkv_5 = torch._C._nn.linear( + hidden_states_47, + l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_47 = l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_10 = qkv_5[(Ellipsis, slice(None, 3072, None))] + key_states_10 = qkv_5[(Ellipsis, slice(3072, 6144, None))] + value_states_10 = qkv_5[(Ellipsis, slice(6144, None, None))] + qkv_5 = None + view_16 = query_states_10.view((1, 2, -1, 96)) + query_states_10 = None + query_states_11 = view_16.transpose(1, 2) + view_16 = None + view_17 = key_states_10.view((1, 2, -1, 96)) + key_states_10 = None + key_states_11 = view_17.transpose(1, 2) + view_17 = None + view_18 = value_states_10.view((1, 2, -1, 96)) + value_states_10 = None + value_states_11 = view_18.transpose(1, 2) + view_18 = None + cos_8 = cos_2.unsqueeze(1) + sin_8 = sin_2.unsqueeze(1) + q_rot_5 = query_states_11[(Ellipsis, slice(None, 96, None))] + q_pass_5 = query_states_11[(Ellipsis, slice(96, None, None))] + query_states_11 = None + k_rot_5 = key_states_11[(Ellipsis, slice(None, 96, None))] + k_pass_5 = key_states_11[(Ellipsis, slice(96, None, None))] + key_states_11 = None + mul_50 = q_rot_5 * cos_8 + x1_10 = q_rot_5[(Ellipsis, slice(None, 48, None))] + 
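
The q_rot/x1/x2/neg/cat dance repeated for every layer here is the standard rotate-half rotary embedding: the first 96 channels of each head (the full head_dim in this model, so the *_pass slices are empty) are rotated by position-dependent cos/sin tables computed once from the inv_freq buffer via the inv_freq-by-position outer product near the top of the graph. An equivalent compact form (a sketch; names are illustrative):

import torch

def rotate_half(x):
    x1, x2 = x[..., :48], x[..., 48:]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary(q, k, cos, sin):
    # cos/sin: (batch, seq, 96); unsqueeze broadcasts over the head dimension.
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)
    q_rot, q_pass = q[..., :96], q[..., 96:]  # *_pass is empty when head_dim == 96
    k_rot, k_pass = k[..., :96], k[..., 96:]
    q = torch.cat([q_rot * cos + rotate_half(q_rot) * sin, q_pass], dim=-1)
    k = torch.cat([k_rot * cos + rotate_half(k_rot) * sin, k_pass], dim=-1)
    return q, k
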
x2_10 = q_rot_5[(Ellipsis, slice(48, None, None))] + q_rot_5 = None + neg_10 = -x2_10 + x2_10 = None + cat_21 = torch.cat((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + mul_51 = cat_21 * sin_8 + cat_21 = None + add_31 = mul_50 + mul_51 + mul_50 = mul_51 = None + q_embed_5 = torch.cat([add_31, q_pass_5], dim=-1) + add_31 = q_pass_5 = None + mul_52 = k_rot_5 * cos_8 + cos_8 = None + x1_11 = k_rot_5[(Ellipsis, slice(None, 48, None))] + x2_11 = k_rot_5[(Ellipsis, slice(48, None, None))] + k_rot_5 = None + neg_11 = -x2_11 + x2_11 = None + cat_23 = torch.cat((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + mul_53 = cat_23 * sin_8 + cat_23 = sin_8 = None + add_32 = mul_52 + mul_53 + mul_52 = mul_53 = None + k_embed_5 = torch.cat([add_32, k_pass_5], dim=-1) + add_32 = k_pass_5 = None + attention_mask_6 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_5 = q_embed_5.contiguous() + q_embed_5 = None + key_5 = k_embed_5.contiguous() + value_5 = value_states_11.contiguous() + attn_output_20 = torch._C._nn.scaled_dot_product_attention( + query_5, + key_5, + value_5, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_5 = key_5 = value_5 = attention_mask_6 = None + transpose_24 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_24.contiguous() + transpose_24 = None + reshape_5 = attn_output_21.reshape(1, 2, -1) + attn_output_21 = None + attn_output_22 = reshape_5.contiguous() + reshape_5 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_22 = l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_10 = torch.nn.functional.dropout(attn_output_23, 0.0, False, False) + attn_output_23 = None + hidden_states_48 = hidden_states_44 + dropout_10 + hidden_states_44 = dropout_10 = None + hidden_states_49 = hidden_states_48.to(torch.float32) + pow_12 = hidden_states_49.pow(2) + variance_11 = pow_12.mean(-1, keepdim=True) + pow_12 = None + add_34 = variance_11 + 1e-05 + variance_11 = None + rsqrt_11 = torch.rsqrt(add_34) + add_34 = None + hidden_states_50 = hidden_states_49 * rsqrt_11 + hidden_states_49 = rsqrt_11 = None + to_27 = hidden_states_50.to(torch.bfloat16) + hidden_states_50 = None + hidden_states_51 = ( + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + * to_27 + ) + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = ( + to_27 + ) = None + up_states_15 = torch._C._nn.linear( + hidden_states_51, + l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_51 = l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_5 = up_states_15.chunk(2, dim=-1) + up_states_15 = None + gate_5 = chunk_5[0] + up_states_16 = chunk_5[1] + chunk_5 = None + silu_5 = torch.nn.functional.silu(gate_5, inplace=False) + gate_5 = None + up_states_17 = up_states_16 * silu_5 + up_states_16 = silu_5 = None + hidden_states_52 = torch._C._nn.linear( + up_states_17, + l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_17 = l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_11 = 
torch.nn.functional.dropout(hidden_states_52, 0.0, False, False) + hidden_states_52 = None + hidden_states_53 = hidden_states_48 + dropout_11 + hidden_states_48 = dropout_11 = None + hidden_states_54 = hidden_states_53.to(torch.float32) + pow_13 = hidden_states_54.pow(2) + variance_12 = pow_13.mean(-1, keepdim=True) + pow_13 = None + add_36 = variance_12 + 1e-05 + variance_12 = None + rsqrt_12 = torch.rsqrt(add_36) + add_36 = None + hidden_states_55 = hidden_states_54 * rsqrt_12 + hidden_states_54 = rsqrt_12 = None + to_29 = hidden_states_55.to(torch.bfloat16) + hidden_states_55 = None + hidden_states_56 = ( + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + * to_29 + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + to_29 + ) = None + qkv_6 = torch._C._nn.linear( + hidden_states_56, + l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_56 = l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_12 = qkv_6[(Ellipsis, slice(None, 3072, None))] + key_states_12 = qkv_6[(Ellipsis, slice(3072, 6144, None))] + value_states_12 = qkv_6[(Ellipsis, slice(6144, None, None))] + qkv_6 = None + view_19 = query_states_12.view((1, 2, -1, 96)) + query_states_12 = None + query_states_13 = view_19.transpose(1, 2) + view_19 = None + view_20 = key_states_12.view((1, 2, -1, 96)) + key_states_12 = None + key_states_13 = view_20.transpose(1, 2) + view_20 = None + view_21 = value_states_12.view((1, 2, -1, 96)) + value_states_12 = None + value_states_13 = view_21.transpose(1, 2) + view_21 = None + cos_9 = cos_2.unsqueeze(1) + sin_9 = sin_2.unsqueeze(1) + q_rot_6 = query_states_13[(Ellipsis, slice(None, 96, None))] + q_pass_6 = query_states_13[(Ellipsis, slice(96, None, None))] + query_states_13 = None + k_rot_6 = key_states_13[(Ellipsis, slice(None, 96, None))] + k_pass_6 = key_states_13[(Ellipsis, slice(96, None, None))] + key_states_13 = None + mul_59 = q_rot_6 * cos_9 + x1_12 = q_rot_6[(Ellipsis, slice(None, 48, None))] + x2_12 = q_rot_6[(Ellipsis, slice(48, None, None))] + q_rot_6 = None + neg_12 = -x2_12 + x2_12 = None + cat_25 = torch.cat((neg_12, x1_12), dim=-1) + neg_12 = x1_12 = None + mul_60 = cat_25 * sin_9 + cat_25 = None + add_37 = mul_59 + mul_60 + mul_59 = mul_60 = None + q_embed_6 = torch.cat([add_37, q_pass_6], dim=-1) + add_37 = q_pass_6 = None + mul_61 = k_rot_6 * cos_9 + cos_9 = None + x1_13 = k_rot_6[(Ellipsis, slice(None, 48, None))] + x2_13 = k_rot_6[(Ellipsis, slice(48, None, None))] + k_rot_6 = None + neg_13 = -x2_13 + x2_13 = None + cat_27 = torch.cat((neg_13, x1_13), dim=-1) + neg_13 = x1_13 = None + mul_62 = cat_27 * sin_9 + cat_27 = sin_9 = None + add_38 = mul_61 + mul_62 + mul_61 = mul_62 = None + k_embed_6 = torch.cat([add_38, k_pass_6], dim=-1) + add_38 = k_pass_6 = None + attention_mask_7 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_6 = q_embed_6.contiguous() + q_embed_6 = None + key_6 = k_embed_6.contiguous() + value_6 = value_states_13.contiguous() + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_6, + value_6, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_6 = key_6 = value_6 = attention_mask_7 = None + transpose_28 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_28.contiguous() + 
transpose_28 = None + reshape_6 = attn_output_25.reshape(1, 2, -1) + attn_output_25 = None + attn_output_26 = reshape_6.contiguous() + reshape_6 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_26 = l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_12 = torch.nn.functional.dropout(attn_output_27, 0.0, False, False) + attn_output_27 = None + hidden_states_57 = hidden_states_53 + dropout_12 + hidden_states_53 = dropout_12 = None + hidden_states_58 = hidden_states_57.to(torch.float32) + pow_14 = hidden_states_58.pow(2) + variance_13 = pow_14.mean(-1, keepdim=True) + pow_14 = None + add_40 = variance_13 + 1e-05 + variance_13 = None + rsqrt_13 = torch.rsqrt(add_40) + add_40 = None + hidden_states_59 = hidden_states_58 * rsqrt_13 + hidden_states_58 = rsqrt_13 = None + to_31 = hidden_states_59.to(torch.bfloat16) + hidden_states_59 = None + hidden_states_60 = ( + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + * to_31 + ) + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = ( + to_31 + ) = None + up_states_18 = torch._C._nn.linear( + hidden_states_60, + l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_60 = l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_6 = up_states_18.chunk(2, dim=-1) + up_states_18 = None + gate_6 = chunk_6[0] + up_states_19 = chunk_6[1] + chunk_6 = None + silu_6 = torch.nn.functional.silu(gate_6, inplace=False) + gate_6 = None + up_states_20 = up_states_19 * silu_6 + up_states_19 = silu_6 = None + hidden_states_61 = torch._C._nn.linear( + up_states_20, + l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_20 = l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_13 = torch.nn.functional.dropout(hidden_states_61, 0.0, False, False) + hidden_states_61 = None + hidden_states_62 = hidden_states_57 + dropout_13 + hidden_states_57 = dropout_13 = None + hidden_states_63 = hidden_states_62.to(torch.float32) + pow_15 = hidden_states_63.pow(2) + variance_14 = pow_15.mean(-1, keepdim=True) + pow_15 = None + add_42 = variance_14 + 1e-05 + variance_14 = None + rsqrt_14 = torch.rsqrt(add_42) + add_42 = None + hidden_states_64 = hidden_states_63 * rsqrt_14 + hidden_states_63 = rsqrt_14 = None + to_33 = hidden_states_64.to(torch.bfloat16) + hidden_states_64 = None + hidden_states_65 = ( + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + * to_33 + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + to_33 + ) = None + qkv_7 = torch._C._nn.linear( + hidden_states_65, + l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_65 = l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_14 = qkv_7[(Ellipsis, slice(None, 3072, None))] + key_states_14 = qkv_7[(Ellipsis, slice(3072, 6144, None))] + value_states_14 = qkv_7[(Ellipsis, slice(6144, None, None))] + qkv_7 = None + view_22 = query_states_14.view((1, 2, -1, 96)) + query_states_14 = None + query_states_15 = view_22.transpose(1, 2) + view_22 = None + view_23 = key_states_14.view((1, 2, -1, 96)) + key_states_14 
= None + key_states_15 = view_23.transpose(1, 2) + view_23 = None + view_24 = value_states_14.view((1, 2, -1, 96)) + value_states_14 = None + value_states_15 = view_24.transpose(1, 2) + view_24 = None + cos_10 = cos_2.unsqueeze(1) + sin_10 = sin_2.unsqueeze(1) + q_rot_7 = query_states_15[(Ellipsis, slice(None, 96, None))] + q_pass_7 = query_states_15[(Ellipsis, slice(96, None, None))] + query_states_15 = None + k_rot_7 = key_states_15[(Ellipsis, slice(None, 96, None))] + k_pass_7 = key_states_15[(Ellipsis, slice(96, None, None))] + key_states_15 = None + mul_68 = q_rot_7 * cos_10 + x1_14 = q_rot_7[(Ellipsis, slice(None, 48, None))] + x2_14 = q_rot_7[(Ellipsis, slice(48, None, None))] + q_rot_7 = None + neg_14 = -x2_14 + x2_14 = None + cat_29 = torch.cat((neg_14, x1_14), dim=-1) + neg_14 = x1_14 = None + mul_69 = cat_29 * sin_10 + cat_29 = None + add_43 = mul_68 + mul_69 + mul_68 = mul_69 = None + q_embed_7 = torch.cat([add_43, q_pass_7], dim=-1) + add_43 = q_pass_7 = None + mul_70 = k_rot_7 * cos_10 + cos_10 = None + x1_15 = k_rot_7[(Ellipsis, slice(None, 48, None))] + x2_15 = k_rot_7[(Ellipsis, slice(48, None, None))] + k_rot_7 = None + neg_15 = -x2_15 + x2_15 = None + cat_31 = torch.cat((neg_15, x1_15), dim=-1) + neg_15 = x1_15 = None + mul_71 = cat_31 * sin_10 + cat_31 = sin_10 = None + add_44 = mul_70 + mul_71 + mul_70 = mul_71 = None + k_embed_7 = torch.cat([add_44, k_pass_7], dim=-1) + add_44 = k_pass_7 = None + attention_mask_8 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_7 = q_embed_7.contiguous() + q_embed_7 = None + key_7 = k_embed_7.contiguous() + value_7 = value_states_15.contiguous() + attn_output_28 = torch._C._nn.scaled_dot_product_attention( + query_7, + key_7, + value_7, + attn_mask=attention_mask_8, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_7 = key_7 = value_7 = attention_mask_8 = None + transpose_32 = attn_output_28.transpose(1, 2) + attn_output_28 = None + attn_output_29 = transpose_32.contiguous() + transpose_32 = None + reshape_7 = attn_output_29.reshape(1, 2, -1) + attn_output_29 = None + attn_output_30 = reshape_7.contiguous() + reshape_7 = None + attn_output_31 = torch._C._nn.linear( + attn_output_30, + l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_30 = l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_14 = torch.nn.functional.dropout(attn_output_31, 0.0, False, False) + attn_output_31 = None + hidden_states_66 = hidden_states_62 + dropout_14 + hidden_states_62 = dropout_14 = None + hidden_states_67 = hidden_states_66.to(torch.float32) + pow_16 = hidden_states_67.pow(2) + variance_15 = pow_16.mean(-1, keepdim=True) + pow_16 = None + add_46 = variance_15 + 1e-05 + variance_15 = None + rsqrt_15 = torch.rsqrt(add_46) + add_46 = None + hidden_states_68 = hidden_states_67 * rsqrt_15 + hidden_states_67 = rsqrt_15 = None + to_35 = hidden_states_68.to(torch.bfloat16) + hidden_states_68 = None + hidden_states_69 = ( + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + * to_35 + ) + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = ( + to_35 + ) = None + up_states_21 = torch._C._nn.linear( + hidden_states_69, + l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_69 = 
l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_7 = up_states_21.chunk(2, dim=-1) + up_states_21 = None + gate_7 = chunk_7[0] + up_states_22 = chunk_7[1] + chunk_7 = None + silu_7 = torch.nn.functional.silu(gate_7, inplace=False) + gate_7 = None + up_states_23 = up_states_22 * silu_7 + up_states_22 = silu_7 = None + hidden_states_70 = torch._C._nn.linear( + up_states_23, + l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_23 = l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_15 = torch.nn.functional.dropout(hidden_states_70, 0.0, False, False) + hidden_states_70 = None + hidden_states_71 = hidden_states_66 + dropout_15 + hidden_states_66 = dropout_15 = None + hidden_states_72 = hidden_states_71.to(torch.float32) + pow_17 = hidden_states_72.pow(2) + variance_16 = pow_17.mean(-1, keepdim=True) + pow_17 = None + add_48 = variance_16 + 1e-05 + variance_16 = None + rsqrt_16 = torch.rsqrt(add_48) + add_48 = None + hidden_states_73 = hidden_states_72 * rsqrt_16 + hidden_states_72 = rsqrt_16 = None + to_37 = hidden_states_73.to(torch.bfloat16) + hidden_states_73 = None + hidden_states_74 = ( + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + * to_37 + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + to_37 + ) = None + qkv_8 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_74 = l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_16 = qkv_8[(Ellipsis, slice(None, 3072, None))] + key_states_16 = qkv_8[(Ellipsis, slice(3072, 6144, None))] + value_states_16 = qkv_8[(Ellipsis, slice(6144, None, None))] + qkv_8 = None + view_25 = query_states_16.view((1, 2, -1, 96)) + query_states_16 = None + query_states_17 = view_25.transpose(1, 2) + view_25 = None + view_26 = key_states_16.view((1, 2, -1, 96)) + key_states_16 = None + key_states_17 = view_26.transpose(1, 2) + view_26 = None + view_27 = value_states_16.view((1, 2, -1, 96)) + value_states_16 = None + value_states_17 = view_27.transpose(1, 2) + view_27 = None + cos_11 = cos_2.unsqueeze(1) + sin_11 = sin_2.unsqueeze(1) + q_rot_8 = query_states_17[(Ellipsis, slice(None, 96, None))] + q_pass_8 = query_states_17[(Ellipsis, slice(96, None, None))] + query_states_17 = None + k_rot_8 = key_states_17[(Ellipsis, slice(None, 96, None))] + k_pass_8 = key_states_17[(Ellipsis, slice(96, None, None))] + key_states_17 = None + mul_77 = q_rot_8 * cos_11 + x1_16 = q_rot_8[(Ellipsis, slice(None, 48, None))] + x2_16 = q_rot_8[(Ellipsis, slice(48, None, None))] + q_rot_8 = None + neg_16 = -x2_16 + x2_16 = None + cat_33 = torch.cat((neg_16, x1_16), dim=-1) + neg_16 = x1_16 = None + mul_78 = cat_33 * sin_11 + cat_33 = None + add_49 = mul_77 + mul_78 + mul_77 = mul_78 = None + q_embed_8 = torch.cat([add_49, q_pass_8], dim=-1) + add_49 = q_pass_8 = None + mul_79 = k_rot_8 * cos_11 + cos_11 = None + x1_17 = k_rot_8[(Ellipsis, slice(None, 48, None))] + x2_17 = k_rot_8[(Ellipsis, slice(48, None, None))] + k_rot_8 = None + neg_17 = -x2_17 + x2_17 = None + cat_35 = torch.cat((neg_17, x1_17), dim=-1) + neg_17 = x1_17 = None + mul_80 = cat_35 * sin_11 + cat_35 = sin_11 = None + add_50 = mul_79 + mul_80 + mul_79 = mul_80 = None + k_embed_8 = torch.cat([add_50, k_pass_8], dim=-1) + add_50 = k_pass_8 = 
None + attention_mask_9 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_8 = q_embed_8.contiguous() + q_embed_8 = None + key_8 = k_embed_8.contiguous() + value_8 = value_states_17.contiguous() + attn_output_32 = torch._C._nn.scaled_dot_product_attention( + query_8, + key_8, + value_8, + attn_mask=attention_mask_9, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_8 = key_8 = value_8 = attention_mask_9 = None + transpose_36 = attn_output_32.transpose(1, 2) + attn_output_32 = None + attn_output_33 = transpose_36.contiguous() + transpose_36 = None + reshape_8 = attn_output_33.reshape(1, 2, -1) + attn_output_33 = None + attn_output_34 = reshape_8.contiguous() + reshape_8 = None + attn_output_35 = torch._C._nn.linear( + attn_output_34, + l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_34 = l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_16 = torch.nn.functional.dropout(attn_output_35, 0.0, False, False) + attn_output_35 = None + hidden_states_75 = hidden_states_71 + dropout_16 + hidden_states_71 = dropout_16 = None + hidden_states_76 = hidden_states_75.to(torch.float32) + pow_18 = hidden_states_76.pow(2) + variance_17 = pow_18.mean(-1, keepdim=True) + pow_18 = None + add_52 = variance_17 + 1e-05 + variance_17 = None + rsqrt_17 = torch.rsqrt(add_52) + add_52 = None + hidden_states_77 = hidden_states_76 * rsqrt_17 + hidden_states_76 = rsqrt_17 = None + to_39 = hidden_states_77.to(torch.bfloat16) + hidden_states_77 = None + hidden_states_78 = ( + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + * to_39 + ) + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = ( + to_39 + ) = None + up_states_24 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_78 = l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_8 = up_states_24.chunk(2, dim=-1) + up_states_24 = None + gate_8 = chunk_8[0] + up_states_25 = chunk_8[1] + chunk_8 = None + silu_8 = torch.nn.functional.silu(gate_8, inplace=False) + gate_8 = None + up_states_26 = up_states_25 * silu_8 + up_states_25 = silu_8 = None + hidden_states_79 = torch._C._nn.linear( + up_states_26, + l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_26 = l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_17 = torch.nn.functional.dropout(hidden_states_79, 0.0, False, False) + hidden_states_79 = None + hidden_states_80 = hidden_states_75 + dropout_17 + hidden_states_75 = dropout_17 = None + hidden_states_81 = hidden_states_80.to(torch.float32) + pow_19 = hidden_states_81.pow(2) + variance_18 = pow_19.mean(-1, keepdim=True) + pow_19 = None + add_54 = variance_18 + 1e-05 + variance_18 = None + rsqrt_18 = torch.rsqrt(add_54) + add_54 = None + hidden_states_82 = hidden_states_81 * rsqrt_18 + hidden_states_81 = rsqrt_18 = None + to_41 = hidden_states_82.to(torch.bfloat16) + hidden_states_82 = None + hidden_states_83 = ( + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + * to_41 + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + to_41 + ) = None + qkv_9 = 
torch._C._nn.linear( + hidden_states_83, + l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_83 = l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_18 = qkv_9[(Ellipsis, slice(None, 3072, None))] + key_states_18 = qkv_9[(Ellipsis, slice(3072, 6144, None))] + value_states_18 = qkv_9[(Ellipsis, slice(6144, None, None))] + qkv_9 = None + view_28 = query_states_18.view((1, 2, -1, 96)) + query_states_18 = None + query_states_19 = view_28.transpose(1, 2) + view_28 = None + view_29 = key_states_18.view((1, 2, -1, 96)) + key_states_18 = None + key_states_19 = view_29.transpose(1, 2) + view_29 = None + view_30 = value_states_18.view((1, 2, -1, 96)) + value_states_18 = None + value_states_19 = view_30.transpose(1, 2) + view_30 = None + cos_12 = cos_2.unsqueeze(1) + sin_12 = sin_2.unsqueeze(1) + q_rot_9 = query_states_19[(Ellipsis, slice(None, 96, None))] + q_pass_9 = query_states_19[(Ellipsis, slice(96, None, None))] + query_states_19 = None + k_rot_9 = key_states_19[(Ellipsis, slice(None, 96, None))] + k_pass_9 = key_states_19[(Ellipsis, slice(96, None, None))] + key_states_19 = None + mul_86 = q_rot_9 * cos_12 + x1_18 = q_rot_9[(Ellipsis, slice(None, 48, None))] + x2_18 = q_rot_9[(Ellipsis, slice(48, None, None))] + q_rot_9 = None + neg_18 = -x2_18 + x2_18 = None + cat_37 = torch.cat((neg_18, x1_18), dim=-1) + neg_18 = x1_18 = None + mul_87 = cat_37 * sin_12 + cat_37 = None + add_55 = mul_86 + mul_87 + mul_86 = mul_87 = None + q_embed_9 = torch.cat([add_55, q_pass_9], dim=-1) + add_55 = q_pass_9 = None + mul_88 = k_rot_9 * cos_12 + cos_12 = None + x1_19 = k_rot_9[(Ellipsis, slice(None, 48, None))] + x2_19 = k_rot_9[(Ellipsis, slice(48, None, None))] + k_rot_9 = None + neg_19 = -x2_19 + x2_19 = None + cat_39 = torch.cat((neg_19, x1_19), dim=-1) + neg_19 = x1_19 = None + mul_89 = cat_39 * sin_12 + cat_39 = sin_12 = None + add_56 = mul_88 + mul_89 + mul_88 = mul_89 = None + k_embed_9 = torch.cat([add_56, k_pass_9], dim=-1) + add_56 = k_pass_9 = None + attention_mask_10 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_9 = q_embed_9.contiguous() + q_embed_9 = None + key_9 = k_embed_9.contiguous() + value_9 = value_states_19.contiguous() + attn_output_36 = torch._C._nn.scaled_dot_product_attention( + query_9, + key_9, + value_9, + attn_mask=attention_mask_10, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_9 = key_9 = value_9 = attention_mask_10 = None + transpose_40 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_37 = transpose_40.contiguous() + transpose_40 = None + reshape_9 = attn_output_37.reshape(1, 2, -1) + attn_output_37 = None + attn_output_38 = reshape_9.contiguous() + reshape_9 = None + attn_output_39 = torch._C._nn.linear( + attn_output_38, + l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_38 = l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_18 = torch.nn.functional.dropout(attn_output_39, 0.0, False, False) + attn_output_39 = None + hidden_states_84 = hidden_states_80 + dropout_18 + hidden_states_80 = dropout_18 = None + hidden_states_85 = hidden_states_84.to(torch.float32) + pow_20 = hidden_states_85.pow(2) + variance_19 = pow_20.mean(-1, keepdim=True) + pow_20 = None + add_58 = variance_19 + 1e-05 + 
variance_19 = None + rsqrt_19 = torch.rsqrt(add_58) + add_58 = None + hidden_states_86 = hidden_states_85 * rsqrt_19 + hidden_states_85 = rsqrt_19 = None + to_43 = hidden_states_86.to(torch.bfloat16) + hidden_states_86 = None + hidden_states_87 = ( + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + * to_43 + ) + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = ( + to_43 + ) = None + up_states_27 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_87 = l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_9 = up_states_27.chunk(2, dim=-1) + up_states_27 = None + gate_9 = chunk_9[0] + up_states_28 = chunk_9[1] + chunk_9 = None + silu_9 = torch.nn.functional.silu(gate_9, inplace=False) + gate_9 = None + up_states_29 = up_states_28 * silu_9 + up_states_28 = silu_9 = None + hidden_states_88 = torch._C._nn.linear( + up_states_29, + l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_29 = l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_19 = torch.nn.functional.dropout(hidden_states_88, 0.0, False, False) + hidden_states_88 = None + hidden_states_89 = hidden_states_84 + dropout_19 + hidden_states_84 = dropout_19 = None + hidden_states_90 = hidden_states_89.to(torch.float32) + pow_21 = hidden_states_90.pow(2) + variance_20 = pow_21.mean(-1, keepdim=True) + pow_21 = None + add_60 = variance_20 + 1e-05 + variance_20 = None + rsqrt_20 = torch.rsqrt(add_60) + add_60 = None + hidden_states_91 = hidden_states_90 * rsqrt_20 + hidden_states_90 = rsqrt_20 = None + to_45 = hidden_states_91.to(torch.bfloat16) + hidden_states_91 = None + hidden_states_92 = ( + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + * to_45 + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + to_45 + ) = None + qkv_10 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_92 = l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_20 = qkv_10[(Ellipsis, slice(None, 3072, None))] + key_states_20 = qkv_10[(Ellipsis, slice(3072, 6144, None))] + value_states_20 = qkv_10[(Ellipsis, slice(6144, None, None))] + qkv_10 = None + view_31 = query_states_20.view((1, 2, -1, 96)) + query_states_20 = None + query_states_21 = view_31.transpose(1, 2) + view_31 = None + view_32 = key_states_20.view((1, 2, -1, 96)) + key_states_20 = None + key_states_21 = view_32.transpose(1, 2) + view_32 = None + view_33 = value_states_20.view((1, 2, -1, 96)) + value_states_20 = None + value_states_21 = view_33.transpose(1, 2) + view_33 = None + cos_13 = cos_2.unsqueeze(1) + sin_13 = sin_2.unsqueeze(1) + q_rot_10 = query_states_21[(Ellipsis, slice(None, 96, None))] + q_pass_10 = query_states_21[(Ellipsis, slice(96, None, None))] + query_states_21 = None + k_rot_10 = key_states_21[(Ellipsis, slice(None, 96, None))] + k_pass_10 = key_states_21[(Ellipsis, slice(96, None, None))] + key_states_21 = None + mul_95 = q_rot_10 * cos_13 + x1_20 = q_rot_10[(Ellipsis, slice(None, 48, None))] + x2_20 = q_rot_10[(Ellipsis, slice(48, None, None))] + q_rot_10 = None + neg_20 = -x2_20 + x2_20 = None + cat_41 = torch.cat((neg_20, x1_20), 
dim=-1) + neg_20 = x1_20 = None + mul_96 = cat_41 * sin_13 + cat_41 = None + add_61 = mul_95 + mul_96 + mul_95 = mul_96 = None + q_embed_10 = torch.cat([add_61, q_pass_10], dim=-1) + add_61 = q_pass_10 = None + mul_97 = k_rot_10 * cos_13 + cos_13 = None + x1_21 = k_rot_10[(Ellipsis, slice(None, 48, None))] + x2_21 = k_rot_10[(Ellipsis, slice(48, None, None))] + k_rot_10 = None + neg_21 = -x2_21 + x2_21 = None + cat_43 = torch.cat((neg_21, x1_21), dim=-1) + neg_21 = x1_21 = None + mul_98 = cat_43 * sin_13 + cat_43 = sin_13 = None + add_62 = mul_97 + mul_98 + mul_97 = mul_98 = None + k_embed_10 = torch.cat([add_62, k_pass_10], dim=-1) + add_62 = k_pass_10 = None + attention_mask_11 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_10 = q_embed_10.contiguous() + q_embed_10 = None + key_10 = k_embed_10.contiguous() + value_10 = value_states_21.contiguous() + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + query_10, + key_10, + value_10, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_10 = key_10 = value_10 = attention_mask_11 = None + transpose_44 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_44.contiguous() + transpose_44 = None + reshape_10 = attn_output_41.reshape(1, 2, -1) + attn_output_41 = None + attn_output_42 = reshape_10.contiguous() + reshape_10 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_42 = l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_20 = torch.nn.functional.dropout(attn_output_43, 0.0, False, False) + attn_output_43 = None + hidden_states_93 = hidden_states_89 + dropout_20 + hidden_states_89 = dropout_20 = None + hidden_states_94 = hidden_states_93.to(torch.float32) + pow_22 = hidden_states_94.pow(2) + variance_21 = pow_22.mean(-1, keepdim=True) + pow_22 = None + add_64 = variance_21 + 1e-05 + variance_21 = None + rsqrt_21 = torch.rsqrt(add_64) + add_64 = None + hidden_states_95 = hidden_states_94 * rsqrt_21 + hidden_states_94 = rsqrt_21 = None + to_47 = hidden_states_95.to(torch.bfloat16) + hidden_states_95 = None + hidden_states_96 = ( + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + * to_47 + ) + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = ( + to_47 + ) = None + up_states_30 = torch._C._nn.linear( + hidden_states_96, + l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_96 = l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_10 = up_states_30.chunk(2, dim=-1) + up_states_30 = None + gate_10 = chunk_10[0] + up_states_31 = chunk_10[1] + chunk_10 = None + silu_10 = torch.nn.functional.silu(gate_10, inplace=False) + gate_10 = None + up_states_32 = up_states_31 * silu_10 + up_states_31 = silu_10 = None + hidden_states_97 = torch._C._nn.linear( + up_states_32, + l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_32 = l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_21 = torch.nn.functional.dropout(hidden_states_97, 0.0, False, False) + hidden_states_97 = None + hidden_states_98 = 
hidden_states_93 + dropout_21 + hidden_states_93 = dropout_21 = None + hidden_states_99 = hidden_states_98.to(torch.float32) + pow_23 = hidden_states_99.pow(2) + variance_22 = pow_23.mean(-1, keepdim=True) + pow_23 = None + add_66 = variance_22 + 1e-05 + variance_22 = None + rsqrt_22 = torch.rsqrt(add_66) + add_66 = None + hidden_states_100 = hidden_states_99 * rsqrt_22 + hidden_states_99 = rsqrt_22 = None + to_49 = hidden_states_100.to(torch.bfloat16) + hidden_states_100 = None + hidden_states_101 = ( + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + * to_49 + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + to_49 + ) = None + qkv_11 = torch._C._nn.linear( + hidden_states_101, + l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_101 = l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_22 = qkv_11[(Ellipsis, slice(None, 3072, None))] + key_states_22 = qkv_11[(Ellipsis, slice(3072, 6144, None))] + value_states_22 = qkv_11[(Ellipsis, slice(6144, None, None))] + qkv_11 = None + view_34 = query_states_22.view((1, 2, -1, 96)) + query_states_22 = None + query_states_23 = view_34.transpose(1, 2) + view_34 = None + view_35 = key_states_22.view((1, 2, -1, 96)) + key_states_22 = None + key_states_23 = view_35.transpose(1, 2) + view_35 = None + view_36 = value_states_22.view((1, 2, -1, 96)) + value_states_22 = None + value_states_23 = view_36.transpose(1, 2) + view_36 = None + cos_14 = cos_2.unsqueeze(1) + sin_14 = sin_2.unsqueeze(1) + q_rot_11 = query_states_23[(Ellipsis, slice(None, 96, None))] + q_pass_11 = query_states_23[(Ellipsis, slice(96, None, None))] + query_states_23 = None + k_rot_11 = key_states_23[(Ellipsis, slice(None, 96, None))] + k_pass_11 = key_states_23[(Ellipsis, slice(96, None, None))] + key_states_23 = None + mul_104 = q_rot_11 * cos_14 + x1_22 = q_rot_11[(Ellipsis, slice(None, 48, None))] + x2_22 = q_rot_11[(Ellipsis, slice(48, None, None))] + q_rot_11 = None + neg_22 = -x2_22 + x2_22 = None + cat_45 = torch.cat((neg_22, x1_22), dim=-1) + neg_22 = x1_22 = None + mul_105 = cat_45 * sin_14 + cat_45 = None + add_67 = mul_104 + mul_105 + mul_104 = mul_105 = None + q_embed_11 = torch.cat([add_67, q_pass_11], dim=-1) + add_67 = q_pass_11 = None + mul_106 = k_rot_11 * cos_14 + cos_14 = None + x1_23 = k_rot_11[(Ellipsis, slice(None, 48, None))] + x2_23 = k_rot_11[(Ellipsis, slice(48, None, None))] + k_rot_11 = None + neg_23 = -x2_23 + x2_23 = None + cat_47 = torch.cat((neg_23, x1_23), dim=-1) + neg_23 = x1_23 = None + mul_107 = cat_47 * sin_14 + cat_47 = sin_14 = None + add_68 = mul_106 + mul_107 + mul_106 = mul_107 = None + k_embed_11 = torch.cat([add_68, k_pass_11], dim=-1) + add_68 = k_pass_11 = None + attention_mask_12 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_11 = q_embed_11.contiguous() + q_embed_11 = None + key_11 = k_embed_11.contiguous() + value_11 = value_states_23.contiguous() + attn_output_44 = torch._C._nn.scaled_dot_product_attention( + query_11, + key_11, + value_11, + attn_mask=attention_mask_12, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_11 = key_11 = value_11 = attention_mask_12 = None + transpose_48 = attn_output_44.transpose(1, 2) + attn_output_44 = None + attn_output_45 = transpose_48.contiguous() + transpose_48 = None + reshape_11 = 
attn_output_45.reshape(1, 2, -1) + attn_output_45 = None + attn_output_46 = reshape_11.contiguous() + reshape_11 = None + attn_output_47 = torch._C._nn.linear( + attn_output_46, + l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_46 = l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_22 = torch.nn.functional.dropout(attn_output_47, 0.0, False, False) + attn_output_47 = None + hidden_states_102 = hidden_states_98 + dropout_22 + hidden_states_98 = dropout_22 = None + hidden_states_103 = hidden_states_102.to(torch.float32) + pow_24 = hidden_states_103.pow(2) + variance_23 = pow_24.mean(-1, keepdim=True) + pow_24 = None + add_70 = variance_23 + 1e-05 + variance_23 = None + rsqrt_23 = torch.rsqrt(add_70) + add_70 = None + hidden_states_104 = hidden_states_103 * rsqrt_23 + hidden_states_103 = rsqrt_23 = None + to_51 = hidden_states_104.to(torch.bfloat16) + hidden_states_104 = None + hidden_states_105 = ( + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + * to_51 + ) + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = ( + to_51 + ) = None + up_states_33 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_105 = l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_11 = up_states_33.chunk(2, dim=-1) + up_states_33 = None + gate_11 = chunk_11[0] + up_states_34 = chunk_11[1] + chunk_11 = None + silu_11 = torch.nn.functional.silu(gate_11, inplace=False) + gate_11 = None + up_states_35 = up_states_34 * silu_11 + up_states_34 = silu_11 = None + hidden_states_106 = torch._C._nn.linear( + up_states_35, + l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_35 = l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_23 = torch.nn.functional.dropout(hidden_states_106, 0.0, False, False) + hidden_states_106 = None + hidden_states_107 = hidden_states_102 + dropout_23 + hidden_states_102 = dropout_23 = None + hidden_states_108 = hidden_states_107.to(torch.float32) + pow_25 = hidden_states_108.pow(2) + variance_24 = pow_25.mean(-1, keepdim=True) + pow_25 = None + add_72 = variance_24 + 1e-05 + variance_24 = None + rsqrt_24 = torch.rsqrt(add_72) + add_72 = None + hidden_states_109 = hidden_states_108 * rsqrt_24 + hidden_states_108 = rsqrt_24 = None + to_53 = hidden_states_109.to(torch.bfloat16) + hidden_states_109 = None + hidden_states_110 = ( + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + * to_53 + ) + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + to_53 + ) = None + qkv_12 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_110 = l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_24 = qkv_12[(Ellipsis, slice(None, 3072, None))] + key_states_24 = qkv_12[(Ellipsis, slice(3072, 6144, None))] + value_states_24 = qkv_12[(Ellipsis, slice(6144, None, None))] + qkv_12 = None + view_37 = query_states_24.view((1, 2, -1, 96)) + query_states_24 = None + query_states_25 = view_37.transpose(1, 2) + view_37 = None + view_38 = key_states_24.view((1, 2, 
-1, 96)) + key_states_24 = None + key_states_25 = view_38.transpose(1, 2) + view_38 = None + view_39 = value_states_24.view((1, 2, -1, 96)) + value_states_24 = None + value_states_25 = view_39.transpose(1, 2) + view_39 = None + cos_15 = cos_2.unsqueeze(1) + sin_15 = sin_2.unsqueeze(1) + q_rot_12 = query_states_25[(Ellipsis, slice(None, 96, None))] + q_pass_12 = query_states_25[(Ellipsis, slice(96, None, None))] + query_states_25 = None + k_rot_12 = key_states_25[(Ellipsis, slice(None, 96, None))] + k_pass_12 = key_states_25[(Ellipsis, slice(96, None, None))] + key_states_25 = None + mul_113 = q_rot_12 * cos_15 + x1_24 = q_rot_12[(Ellipsis, slice(None, 48, None))] + x2_24 = q_rot_12[(Ellipsis, slice(48, None, None))] + q_rot_12 = None + neg_24 = -x2_24 + x2_24 = None + cat_49 = torch.cat((neg_24, x1_24), dim=-1) + neg_24 = x1_24 = None + mul_114 = cat_49 * sin_15 + cat_49 = None + add_73 = mul_113 + mul_114 + mul_113 = mul_114 = None + q_embed_12 = torch.cat([add_73, q_pass_12], dim=-1) + add_73 = q_pass_12 = None + mul_115 = k_rot_12 * cos_15 + cos_15 = None + x1_25 = k_rot_12[(Ellipsis, slice(None, 48, None))] + x2_25 = k_rot_12[(Ellipsis, slice(48, None, None))] + k_rot_12 = None + neg_25 = -x2_25 + x2_25 = None + cat_51 = torch.cat((neg_25, x1_25), dim=-1) + neg_25 = x1_25 = None + mul_116 = cat_51 * sin_15 + cat_51 = sin_15 = None + add_74 = mul_115 + mul_116 + mul_115 = mul_116 = None + k_embed_12 = torch.cat([add_74, k_pass_12], dim=-1) + add_74 = k_pass_12 = None + attention_mask_13 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_12 = q_embed_12.contiguous() + q_embed_12 = None + key_12 = k_embed_12.contiguous() + value_12 = value_states_25.contiguous() + attn_output_48 = torch._C._nn.scaled_dot_product_attention( + query_12, + key_12, + value_12, + attn_mask=attention_mask_13, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_12 = key_12 = value_12 = attention_mask_13 = None + transpose_52 = attn_output_48.transpose(1, 2) + attn_output_48 = None + attn_output_49 = transpose_52.contiguous() + transpose_52 = None + reshape_12 = attn_output_49.reshape(1, 2, -1) + attn_output_49 = None + attn_output_50 = reshape_12.contiguous() + reshape_12 = None + attn_output_51 = torch._C._nn.linear( + attn_output_50, + l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_50 = l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_24 = torch.nn.functional.dropout(attn_output_51, 0.0, False, False) + attn_output_51 = None + hidden_states_111 = hidden_states_107 + dropout_24 + hidden_states_107 = dropout_24 = None + hidden_states_112 = hidden_states_111.to(torch.float32) + pow_26 = hidden_states_112.pow(2) + variance_25 = pow_26.mean(-1, keepdim=True) + pow_26 = None + add_76 = variance_25 + 1e-05 + variance_25 = None + rsqrt_25 = torch.rsqrt(add_76) + add_76 = None + hidden_states_113 = hidden_states_112 * rsqrt_25 + hidden_states_112 = rsqrt_25 = None + to_55 = hidden_states_113.to(torch.bfloat16) + hidden_states_113 = None + hidden_states_114 = ( + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + * to_55 + ) + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = ( + to_55 + ) = None + up_states_36 = torch._C._nn.linear( + hidden_states_114, + 
l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_114 = l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_12 = up_states_36.chunk(2, dim=-1) + up_states_36 = None + gate_12 = chunk_12[0] + up_states_37 = chunk_12[1] + chunk_12 = None + silu_12 = torch.nn.functional.silu(gate_12, inplace=False) + gate_12 = None + up_states_38 = up_states_37 * silu_12 + up_states_37 = silu_12 = None + hidden_states_115 = torch._C._nn.linear( + up_states_38, + l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_38 = l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_25 = torch.nn.functional.dropout(hidden_states_115, 0.0, False, False) + hidden_states_115 = None + hidden_states_116 = hidden_states_111 + dropout_25 + hidden_states_111 = dropout_25 = None + hidden_states_117 = hidden_states_116.to(torch.float32) + pow_27 = hidden_states_117.pow(2) + variance_26 = pow_27.mean(-1, keepdim=True) + pow_27 = None + add_78 = variance_26 + 1e-05 + variance_26 = None + rsqrt_26 = torch.rsqrt(add_78) + add_78 = None + hidden_states_118 = hidden_states_117 * rsqrt_26 + hidden_states_117 = rsqrt_26 = None + to_57 = hidden_states_118.to(torch.bfloat16) + hidden_states_118 = None + hidden_states_119 = ( + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + * to_57 + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + to_57 + ) = None + qkv_13 = torch._C._nn.linear( + hidden_states_119, + l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_119 = l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_26 = qkv_13[(Ellipsis, slice(None, 3072, None))] + key_states_26 = qkv_13[(Ellipsis, slice(3072, 6144, None))] + value_states_26 = qkv_13[(Ellipsis, slice(6144, None, None))] + qkv_13 = None + view_40 = query_states_26.view((1, 2, -1, 96)) + query_states_26 = None + query_states_27 = view_40.transpose(1, 2) + view_40 = None + view_41 = key_states_26.view((1, 2, -1, 96)) + key_states_26 = None + key_states_27 = view_41.transpose(1, 2) + view_41 = None + view_42 = value_states_26.view((1, 2, -1, 96)) + value_states_26 = None + value_states_27 = view_42.transpose(1, 2) + view_42 = None + cos_16 = cos_2.unsqueeze(1) + sin_16 = sin_2.unsqueeze(1) + q_rot_13 = query_states_27[(Ellipsis, slice(None, 96, None))] + q_pass_13 = query_states_27[(Ellipsis, slice(96, None, None))] + query_states_27 = None + k_rot_13 = key_states_27[(Ellipsis, slice(None, 96, None))] + k_pass_13 = key_states_27[(Ellipsis, slice(96, None, None))] + key_states_27 = None + mul_122 = q_rot_13 * cos_16 + x1_26 = q_rot_13[(Ellipsis, slice(None, 48, None))] + x2_26 = q_rot_13[(Ellipsis, slice(48, None, None))] + q_rot_13 = None + neg_26 = -x2_26 + x2_26 = None + cat_53 = torch.cat((neg_26, x1_26), dim=-1) + neg_26 = x1_26 = None + mul_123 = cat_53 * sin_16 + cat_53 = None + add_79 = mul_122 + mul_123 + mul_122 = mul_123 = None + q_embed_13 = torch.cat([add_79, q_pass_13], dim=-1) + add_79 = q_pass_13 = None + mul_124 = k_rot_13 * cos_16 + cos_16 = None + x1_27 = k_rot_13[(Ellipsis, slice(None, 48, None))] + x2_27 = k_rot_13[(Ellipsis, slice(48, None, None))] + k_rot_13 = None + neg_27 = -x2_27 + x2_27 = None + cat_55 = torch.cat((neg_27, x1_27), dim=-1) + neg_27 = x1_27 = 
None + mul_125 = cat_55 * sin_16 + cat_55 = sin_16 = None + add_80 = mul_124 + mul_125 + mul_124 = mul_125 = None + k_embed_13 = torch.cat([add_80, k_pass_13], dim=-1) + add_80 = k_pass_13 = None + attention_mask_14 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_13 = q_embed_13.contiguous() + q_embed_13 = None + key_13 = k_embed_13.contiguous() + value_13 = value_states_27.contiguous() + attn_output_52 = torch._C._nn.scaled_dot_product_attention( + query_13, + key_13, + value_13, + attn_mask=attention_mask_14, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_13 = key_13 = value_13 = attention_mask_14 = None + transpose_56 = attn_output_52.transpose(1, 2) + attn_output_52 = None + attn_output_53 = transpose_56.contiguous() + transpose_56 = None + reshape_13 = attn_output_53.reshape(1, 2, -1) + attn_output_53 = None + attn_output_54 = reshape_13.contiguous() + reshape_13 = None + attn_output_55 = torch._C._nn.linear( + attn_output_54, + l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_54 = l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_26 = torch.nn.functional.dropout(attn_output_55, 0.0, False, False) + attn_output_55 = None + hidden_states_120 = hidden_states_116 + dropout_26 + hidden_states_116 = dropout_26 = None + hidden_states_121 = hidden_states_120.to(torch.float32) + pow_28 = hidden_states_121.pow(2) + variance_27 = pow_28.mean(-1, keepdim=True) + pow_28 = None + add_82 = variance_27 + 1e-05 + variance_27 = None + rsqrt_27 = torch.rsqrt(add_82) + add_82 = None + hidden_states_122 = hidden_states_121 * rsqrt_27 + hidden_states_121 = rsqrt_27 = None + to_59 = hidden_states_122.to(torch.bfloat16) + hidden_states_122 = None + hidden_states_123 = ( + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + * to_59 + ) + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = ( + to_59 + ) = None + up_states_39 = torch._C._nn.linear( + hidden_states_123, + l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_123 = l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_13 = up_states_39.chunk(2, dim=-1) + up_states_39 = None + gate_13 = chunk_13[0] + up_states_40 = chunk_13[1] + chunk_13 = None + silu_13 = torch.nn.functional.silu(gate_13, inplace=False) + gate_13 = None + up_states_41 = up_states_40 * silu_13 + up_states_40 = silu_13 = None + hidden_states_124 = torch._C._nn.linear( + up_states_41, + l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_41 = l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_27 = torch.nn.functional.dropout(hidden_states_124, 0.0, False, False) + hidden_states_124 = None + hidden_states_125 = hidden_states_120 + dropout_27 + hidden_states_120 = dropout_27 = None + hidden_states_126 = hidden_states_125.to(torch.float32) + pow_29 = hidden_states_126.pow(2) + variance_28 = pow_29.mean(-1, keepdim=True) + pow_29 = None + add_84 = variance_28 + 1e-05 + variance_28 = None + rsqrt_28 = torch.rsqrt(add_84) + add_84 = None + hidden_states_127 = hidden_states_126 * rsqrt_28 + hidden_states_126 = rsqrt_28 = None + to_61 = hidden_states_127.to(torch.bfloat16) + 
hidden_states_127 = None + hidden_states_128 = ( + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + * to_61 + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + to_61 + ) = None + qkv_14 = torch._C._nn.linear( + hidden_states_128, + l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_128 = l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_28 = qkv_14[(Ellipsis, slice(None, 3072, None))] + key_states_28 = qkv_14[(Ellipsis, slice(3072, 6144, None))] + value_states_28 = qkv_14[(Ellipsis, slice(6144, None, None))] + qkv_14 = None + view_43 = query_states_28.view((1, 2, -1, 96)) + query_states_28 = None + query_states_29 = view_43.transpose(1, 2) + view_43 = None + view_44 = key_states_28.view((1, 2, -1, 96)) + key_states_28 = None + key_states_29 = view_44.transpose(1, 2) + view_44 = None + view_45 = value_states_28.view((1, 2, -1, 96)) + value_states_28 = None + value_states_29 = view_45.transpose(1, 2) + view_45 = None + cos_17 = cos_2.unsqueeze(1) + sin_17 = sin_2.unsqueeze(1) + q_rot_14 = query_states_29[(Ellipsis, slice(None, 96, None))] + q_pass_14 = query_states_29[(Ellipsis, slice(96, None, None))] + query_states_29 = None + k_rot_14 = key_states_29[(Ellipsis, slice(None, 96, None))] + k_pass_14 = key_states_29[(Ellipsis, slice(96, None, None))] + key_states_29 = None + mul_131 = q_rot_14 * cos_17 + x1_28 = q_rot_14[(Ellipsis, slice(None, 48, None))] + x2_28 = q_rot_14[(Ellipsis, slice(48, None, None))] + q_rot_14 = None + neg_28 = -x2_28 + x2_28 = None + cat_57 = torch.cat((neg_28, x1_28), dim=-1) + neg_28 = x1_28 = None + mul_132 = cat_57 * sin_17 + cat_57 = None + add_85 = mul_131 + mul_132 + mul_131 = mul_132 = None + q_embed_14 = torch.cat([add_85, q_pass_14], dim=-1) + add_85 = q_pass_14 = None + mul_133 = k_rot_14 * cos_17 + cos_17 = None + x1_29 = k_rot_14[(Ellipsis, slice(None, 48, None))] + x2_29 = k_rot_14[(Ellipsis, slice(48, None, None))] + k_rot_14 = None + neg_29 = -x2_29 + x2_29 = None + cat_59 = torch.cat((neg_29, x1_29), dim=-1) + neg_29 = x1_29 = None + mul_134 = cat_59 * sin_17 + cat_59 = sin_17 = None + add_86 = mul_133 + mul_134 + mul_133 = mul_134 = None + k_embed_14 = torch.cat([add_86, k_pass_14], dim=-1) + add_86 = k_pass_14 = None + attention_mask_15 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_14 = q_embed_14.contiguous() + q_embed_14 = None + key_14 = k_embed_14.contiguous() + value_14 = value_states_29.contiguous() + attn_output_56 = torch._C._nn.scaled_dot_product_attention( + query_14, + key_14, + value_14, + attn_mask=attention_mask_15, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_14 = key_14 = value_14 = attention_mask_15 = None + transpose_60 = attn_output_56.transpose(1, 2) + attn_output_56 = None + attn_output_57 = transpose_60.contiguous() + transpose_60 = None + reshape_14 = attn_output_57.reshape(1, 2, -1) + attn_output_57 = None + attn_output_58 = reshape_14.contiguous() + reshape_14 = None + attn_output_59 = torch._C._nn.linear( + attn_output_58, + l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_58 = l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_28 = torch.nn.functional.dropout(attn_output_59, 0.0, False, False) 
+ attn_output_59 = None + hidden_states_129 = hidden_states_125 + dropout_28 + hidden_states_125 = dropout_28 = None + hidden_states_130 = hidden_states_129.to(torch.float32) + pow_30 = hidden_states_130.pow(2) + variance_29 = pow_30.mean(-1, keepdim=True) + pow_30 = None + add_88 = variance_29 + 1e-05 + variance_29 = None + rsqrt_29 = torch.rsqrt(add_88) + add_88 = None + hidden_states_131 = hidden_states_130 * rsqrt_29 + hidden_states_130 = rsqrt_29 = None + to_63 = hidden_states_131.to(torch.bfloat16) + hidden_states_131 = None + hidden_states_132 = ( + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + * to_63 + ) + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = ( + to_63 + ) = None + up_states_42 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_132 = l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_14 = up_states_42.chunk(2, dim=-1) + up_states_42 = None + gate_14 = chunk_14[0] + up_states_43 = chunk_14[1] + chunk_14 = None + silu_14 = torch.nn.functional.silu(gate_14, inplace=False) + gate_14 = None + up_states_44 = up_states_43 * silu_14 + up_states_43 = silu_14 = None + hidden_states_133 = torch._C._nn.linear( + up_states_44, + l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_44 = l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_29 = torch.nn.functional.dropout(hidden_states_133, 0.0, False, False) + hidden_states_133 = None + hidden_states_134 = hidden_states_129 + dropout_29 + hidden_states_129 = dropout_29 = None + hidden_states_135 = hidden_states_134.to(torch.float32) + pow_31 = hidden_states_135.pow(2) + variance_30 = pow_31.mean(-1, keepdim=True) + pow_31 = None + add_90 = variance_30 + 1e-05 + variance_30 = None + rsqrt_30 = torch.rsqrt(add_90) + add_90 = None + hidden_states_136 = hidden_states_135 * rsqrt_30 + hidden_states_135 = rsqrt_30 = None + to_65 = hidden_states_136.to(torch.bfloat16) + hidden_states_136 = None + hidden_states_137 = ( + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + * to_65 + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + to_65 + ) = None + qkv_15 = torch._C._nn.linear( + hidden_states_137, + l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_137 = l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_30 = qkv_15[(Ellipsis, slice(None, 3072, None))] + key_states_30 = qkv_15[(Ellipsis, slice(3072, 6144, None))] + value_states_30 = qkv_15[(Ellipsis, slice(6144, None, None))] + qkv_15 = None + view_46 = query_states_30.view((1, 2, -1, 96)) + query_states_30 = None + query_states_31 = view_46.transpose(1, 2) + view_46 = None + view_47 = key_states_30.view((1, 2, -1, 96)) + key_states_30 = None + key_states_31 = view_47.transpose(1, 2) + view_47 = None + view_48 = value_states_30.view((1, 2, -1, 96)) + value_states_30 = None + value_states_31 = view_48.transpose(1, 2) + view_48 = None + cos_18 = cos_2.unsqueeze(1) + sin_18 = sin_2.unsqueeze(1) + q_rot_15 = query_states_31[(Ellipsis, slice(None, 96, None))] + q_pass_15 = query_states_31[(Ellipsis, slice(96, None, None))] + query_states_31 = None + k_rot_15 = 
key_states_31[(Ellipsis, slice(None, 96, None))] + k_pass_15 = key_states_31[(Ellipsis, slice(96, None, None))] + key_states_31 = None + mul_140 = q_rot_15 * cos_18 + x1_30 = q_rot_15[(Ellipsis, slice(None, 48, None))] + x2_30 = q_rot_15[(Ellipsis, slice(48, None, None))] + q_rot_15 = None + neg_30 = -x2_30 + x2_30 = None + cat_61 = torch.cat((neg_30, x1_30), dim=-1) + neg_30 = x1_30 = None + mul_141 = cat_61 * sin_18 + cat_61 = None + add_91 = mul_140 + mul_141 + mul_140 = mul_141 = None + q_embed_15 = torch.cat([add_91, q_pass_15], dim=-1) + add_91 = q_pass_15 = None + mul_142 = k_rot_15 * cos_18 + cos_18 = None + x1_31 = k_rot_15[(Ellipsis, slice(None, 48, None))] + x2_31 = k_rot_15[(Ellipsis, slice(48, None, None))] + k_rot_15 = None + neg_31 = -x2_31 + x2_31 = None + cat_63 = torch.cat((neg_31, x1_31), dim=-1) + neg_31 = x1_31 = None + mul_143 = cat_63 * sin_18 + cat_63 = sin_18 = None + add_92 = mul_142 + mul_143 + mul_142 = mul_143 = None + k_embed_15 = torch.cat([add_92, k_pass_15], dim=-1) + add_92 = k_pass_15 = None + attention_mask_16 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_15 = q_embed_15.contiguous() + q_embed_15 = None + key_15 = k_embed_15.contiguous() + value_15 = value_states_31.contiguous() + attn_output_60 = torch._C._nn.scaled_dot_product_attention( + query_15, + key_15, + value_15, + attn_mask=attention_mask_16, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_15 = key_15 = value_15 = attention_mask_16 = None + transpose_64 = attn_output_60.transpose(1, 2) + attn_output_60 = None + attn_output_61 = transpose_64.contiguous() + transpose_64 = None + reshape_15 = attn_output_61.reshape(1, 2, -1) + attn_output_61 = None + attn_output_62 = reshape_15.contiguous() + reshape_15 = None + attn_output_63 = torch._C._nn.linear( + attn_output_62, + l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_62 = l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_30 = torch.nn.functional.dropout(attn_output_63, 0.0, False, False) + attn_output_63 = None + hidden_states_138 = hidden_states_134 + dropout_30 + hidden_states_134 = dropout_30 = None + hidden_states_139 = hidden_states_138.to(torch.float32) + pow_32 = hidden_states_139.pow(2) + variance_31 = pow_32.mean(-1, keepdim=True) + pow_32 = None + add_94 = variance_31 + 1e-05 + variance_31 = None + rsqrt_31 = torch.rsqrt(add_94) + add_94 = None + hidden_states_140 = hidden_states_139 * rsqrt_31 + hidden_states_139 = rsqrt_31 = None + to_67 = hidden_states_140.to(torch.bfloat16) + hidden_states_140 = None + hidden_states_141 = ( + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + * to_67 + ) + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = ( + to_67 + ) = None + up_states_45 = torch._C._nn.linear( + hidden_states_141, + l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_141 = l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_15 = up_states_45.chunk(2, dim=-1) + up_states_45 = None + gate_15 = chunk_15[0] + up_states_46 = chunk_15[1] + chunk_15 = None + silu_15 = torch.nn.functional.silu(gate_15, inplace=False) + gate_15 = None + up_states_47 = up_states_46 * silu_15 + up_states_46 = silu_15 = None + 
hidden_states_142 = torch._C._nn.linear( + up_states_47, + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_47 = l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_31 = torch.nn.functional.dropout(hidden_states_142, 0.0, False, False) + hidden_states_142 = None + hidden_states_143 = hidden_states_138 + dropout_31 + hidden_states_138 = dropout_31 = None + hidden_states_144 = hidden_states_143.to(torch.float32) + pow_33 = hidden_states_144.pow(2) + variance_32 = pow_33.mean(-1, keepdim=True) + pow_33 = None + add_96 = variance_32 + 1e-05 + variance_32 = None + rsqrt_32 = torch.rsqrt(add_96) + add_96 = None + hidden_states_145 = hidden_states_144 * rsqrt_32 + hidden_states_144 = rsqrt_32 = None + to_69 = hidden_states_145.to(torch.bfloat16) + hidden_states_145 = None + hidden_states_146 = ( + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + * to_69 + ) + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + to_69 + ) = None + qkv_16 = torch._C._nn.linear( + hidden_states_146, + l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_146 = l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_32 = qkv_16[(Ellipsis, slice(None, 3072, None))] + key_states_32 = qkv_16[(Ellipsis, slice(3072, 6144, None))] + value_states_32 = qkv_16[(Ellipsis, slice(6144, None, None))] + qkv_16 = None + view_49 = query_states_32.view((1, 2, -1, 96)) + query_states_32 = None + query_states_33 = view_49.transpose(1, 2) + view_49 = None + view_50 = key_states_32.view((1, 2, -1, 96)) + key_states_32 = None + key_states_33 = view_50.transpose(1, 2) + view_50 = None + view_51 = value_states_32.view((1, 2, -1, 96)) + value_states_32 = None + value_states_33 = view_51.transpose(1, 2) + view_51 = None + cos_19 = cos_2.unsqueeze(1) + sin_19 = sin_2.unsqueeze(1) + q_rot_16 = query_states_33[(Ellipsis, slice(None, 96, None))] + q_pass_16 = query_states_33[(Ellipsis, slice(96, None, None))] + query_states_33 = None + k_rot_16 = key_states_33[(Ellipsis, slice(None, 96, None))] + k_pass_16 = key_states_33[(Ellipsis, slice(96, None, None))] + key_states_33 = None + mul_149 = q_rot_16 * cos_19 + x1_32 = q_rot_16[(Ellipsis, slice(None, 48, None))] + x2_32 = q_rot_16[(Ellipsis, slice(48, None, None))] + q_rot_16 = None + neg_32 = -x2_32 + x2_32 = None + cat_65 = torch.cat((neg_32, x1_32), dim=-1) + neg_32 = x1_32 = None + mul_150 = cat_65 * sin_19 + cat_65 = None + add_97 = mul_149 + mul_150 + mul_149 = mul_150 = None + q_embed_16 = torch.cat([add_97, q_pass_16], dim=-1) + add_97 = q_pass_16 = None + mul_151 = k_rot_16 * cos_19 + cos_19 = None + x1_33 = k_rot_16[(Ellipsis, slice(None, 48, None))] + x2_33 = k_rot_16[(Ellipsis, slice(48, None, None))] + k_rot_16 = None + neg_33 = -x2_33 + x2_33 = None + cat_67 = torch.cat((neg_33, x1_33), dim=-1) + neg_33 = x1_33 = None + mul_152 = cat_67 * sin_19 + cat_67 = sin_19 = None + add_98 = mul_151 + mul_152 + mul_151 = mul_152 = None + k_embed_16 = torch.cat([add_98, k_pass_16], dim=-1) + add_98 = k_pass_16 = None + attention_mask_17 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_16 = q_embed_16.contiguous() + q_embed_16 = None + key_16 = k_embed_16.contiguous() + value_16 = value_states_33.contiguous() + attn_output_64 = 
torch._C._nn.scaled_dot_product_attention( + query_16, + key_16, + value_16, + attn_mask=attention_mask_17, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_16 = key_16 = value_16 = attention_mask_17 = None + transpose_68 = attn_output_64.transpose(1, 2) + attn_output_64 = None + attn_output_65 = transpose_68.contiguous() + transpose_68 = None + reshape_16 = attn_output_65.reshape(1, 2, -1) + attn_output_65 = None + attn_output_66 = reshape_16.contiguous() + reshape_16 = None + attn_output_67 = torch._C._nn.linear( + attn_output_66, + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_66 = l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_32 = torch.nn.functional.dropout(attn_output_67, 0.0, False, False) + attn_output_67 = None + hidden_states_147 = hidden_states_143 + dropout_32 + hidden_states_143 = dropout_32 = None + hidden_states_148 = hidden_states_147.to(torch.float32) + pow_34 = hidden_states_148.pow(2) + variance_33 = pow_34.mean(-1, keepdim=True) + pow_34 = None + add_100 = variance_33 + 1e-05 + variance_33 = None + rsqrt_33 = torch.rsqrt(add_100) + add_100 = None + hidden_states_149 = hidden_states_148 * rsqrt_33 + hidden_states_148 = rsqrt_33 = None + to_71 = hidden_states_149.to(torch.bfloat16) + hidden_states_149 = None + hidden_states_150 = ( + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + * to_71 + ) + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = ( + to_71 + ) = None + up_states_48 = torch._C._nn.linear( + hidden_states_150, + l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_150 = l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_16 = up_states_48.chunk(2, dim=-1) + up_states_48 = None + gate_16 = chunk_16[0] + up_states_49 = chunk_16[1] + chunk_16 = None + silu_16 = torch.nn.functional.silu(gate_16, inplace=False) + gate_16 = None + up_states_50 = up_states_49 * silu_16 + up_states_49 = silu_16 = None + hidden_states_151 = torch._C._nn.linear( + up_states_50, + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_50 = l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_33 = torch.nn.functional.dropout(hidden_states_151, 0.0, False, False) + hidden_states_151 = None + hidden_states_152 = hidden_states_147 + dropout_33 + hidden_states_147 = dropout_33 = None + hidden_states_153 = hidden_states_152.to(torch.float32) + pow_35 = hidden_states_153.pow(2) + variance_34 = pow_35.mean(-1, keepdim=True) + pow_35 = None + add_102 = variance_34 + 1e-05 + variance_34 = None + rsqrt_34 = torch.rsqrt(add_102) + add_102 = None + hidden_states_154 = hidden_states_153 * rsqrt_34 + hidden_states_153 = rsqrt_34 = None + to_73 = hidden_states_154.to(torch.bfloat16) + hidden_states_154 = None + hidden_states_155 = ( + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + * to_73 + ) + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + to_73 + ) = None + qkv_17 = torch._C._nn.linear( + hidden_states_155, + l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_155 = 
l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_34 = qkv_17[(Ellipsis, slice(None, 3072, None))] + key_states_34 = qkv_17[(Ellipsis, slice(3072, 6144, None))] + value_states_34 = qkv_17[(Ellipsis, slice(6144, None, None))] + qkv_17 = None + view_52 = query_states_34.view((1, 2, -1, 96)) + query_states_34 = None + query_states_35 = view_52.transpose(1, 2) + view_52 = None + view_53 = key_states_34.view((1, 2, -1, 96)) + key_states_34 = None + key_states_35 = view_53.transpose(1, 2) + view_53 = None + view_54 = value_states_34.view((1, 2, -1, 96)) + value_states_34 = None + value_states_35 = view_54.transpose(1, 2) + view_54 = None + cos_20 = cos_2.unsqueeze(1) + sin_20 = sin_2.unsqueeze(1) + q_rot_17 = query_states_35[(Ellipsis, slice(None, 96, None))] + q_pass_17 = query_states_35[(Ellipsis, slice(96, None, None))] + query_states_35 = None + k_rot_17 = key_states_35[(Ellipsis, slice(None, 96, None))] + k_pass_17 = key_states_35[(Ellipsis, slice(96, None, None))] + key_states_35 = None + mul_158 = q_rot_17 * cos_20 + x1_34 = q_rot_17[(Ellipsis, slice(None, 48, None))] + x2_34 = q_rot_17[(Ellipsis, slice(48, None, None))] + q_rot_17 = None + neg_34 = -x2_34 + x2_34 = None + cat_69 = torch.cat((neg_34, x1_34), dim=-1) + neg_34 = x1_34 = None + mul_159 = cat_69 * sin_20 + cat_69 = None + add_103 = mul_158 + mul_159 + mul_158 = mul_159 = None + q_embed_17 = torch.cat([add_103, q_pass_17], dim=-1) + add_103 = q_pass_17 = None + mul_160 = k_rot_17 * cos_20 + cos_20 = None + x1_35 = k_rot_17[(Ellipsis, slice(None, 48, None))] + x2_35 = k_rot_17[(Ellipsis, slice(48, None, None))] + k_rot_17 = None + neg_35 = -x2_35 + x2_35 = None + cat_71 = torch.cat((neg_35, x1_35), dim=-1) + neg_35 = x1_35 = None + mul_161 = cat_71 * sin_20 + cat_71 = sin_20 = None + add_104 = mul_160 + mul_161 + mul_160 = mul_161 = None + k_embed_17 = torch.cat([add_104, k_pass_17], dim=-1) + add_104 = k_pass_17 = None + attention_mask_18 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_17 = q_embed_17.contiguous() + q_embed_17 = None + key_17 = k_embed_17.contiguous() + value_17 = value_states_35.contiguous() + attn_output_68 = torch._C._nn.scaled_dot_product_attention( + query_17, + key_17, + value_17, + attn_mask=attention_mask_18, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_17 = key_17 = value_17 = attention_mask_18 = None + transpose_72 = attn_output_68.transpose(1, 2) + attn_output_68 = None + attn_output_69 = transpose_72.contiguous() + transpose_72 = None + reshape_17 = attn_output_69.reshape(1, 2, -1) + attn_output_69 = None + attn_output_70 = reshape_17.contiguous() + reshape_17 = None + attn_output_71 = torch._C._nn.linear( + attn_output_70, + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_70 = l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_34 = torch.nn.functional.dropout(attn_output_71, 0.0, False, False) + attn_output_71 = None + hidden_states_156 = hidden_states_152 + dropout_34 + hidden_states_152 = dropout_34 = None + hidden_states_157 = hidden_states_156.to(torch.float32) + pow_36 = hidden_states_157.pow(2) + variance_35 = pow_36.mean(-1, keepdim=True) + pow_36 = None + add_106 = variance_35 + 1e-05 + variance_35 = None + rsqrt_35 = torch.rsqrt(add_106) + add_106 = None + hidden_states_158 = 
hidden_states_157 * rsqrt_35 + hidden_states_157 = rsqrt_35 = None + to_75 = hidden_states_158.to(torch.bfloat16) + hidden_states_158 = None + hidden_states_159 = ( + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + * to_75 + ) + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = ( + to_75 + ) = None + up_states_51 = torch._C._nn.linear( + hidden_states_159, + l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_159 = l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_17 = up_states_51.chunk(2, dim=-1) + up_states_51 = None + gate_17 = chunk_17[0] + up_states_52 = chunk_17[1] + chunk_17 = None + silu_17 = torch.nn.functional.silu(gate_17, inplace=False) + gate_17 = None + up_states_53 = up_states_52 * silu_17 + up_states_52 = silu_17 = None + hidden_states_160 = torch._C._nn.linear( + up_states_53, + l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_53 = l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_35 = torch.nn.functional.dropout(hidden_states_160, 0.0, False, False) + hidden_states_160 = None + hidden_states_161 = hidden_states_156 + dropout_35 + hidden_states_156 = dropout_35 = None + hidden_states_162 = hidden_states_161.to(torch.float32) + pow_37 = hidden_states_162.pow(2) + variance_36 = pow_37.mean(-1, keepdim=True) + pow_37 = None + add_108 = variance_36 + 1e-05 + variance_36 = None + rsqrt_36 = torch.rsqrt(add_108) + add_108 = None + hidden_states_163 = hidden_states_162 * rsqrt_36 + hidden_states_162 = rsqrt_36 = None + to_77 = hidden_states_163.to(torch.bfloat16) + hidden_states_163 = None + hidden_states_164 = ( + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + * to_77 + ) + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + to_77 + ) = None + qkv_18 = torch._C._nn.linear( + hidden_states_164, + l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_164 = l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_36 = qkv_18[(Ellipsis, slice(None, 3072, None))] + key_states_36 = qkv_18[(Ellipsis, slice(3072, 6144, None))] + value_states_36 = qkv_18[(Ellipsis, slice(6144, None, None))] + qkv_18 = None + view_55 = query_states_36.view((1, 2, -1, 96)) + query_states_36 = None + query_states_37 = view_55.transpose(1, 2) + view_55 = None + view_56 = key_states_36.view((1, 2, -1, 96)) + key_states_36 = None + key_states_37 = view_56.transpose(1, 2) + view_56 = None + view_57 = value_states_36.view((1, 2, -1, 96)) + value_states_36 = None + value_states_37 = view_57.transpose(1, 2) + view_57 = None + cos_21 = cos_2.unsqueeze(1) + sin_21 = sin_2.unsqueeze(1) + q_rot_18 = query_states_37[(Ellipsis, slice(None, 96, None))] + q_pass_18 = query_states_37[(Ellipsis, slice(96, None, None))] + query_states_37 = None + k_rot_18 = key_states_37[(Ellipsis, slice(None, 96, None))] + k_pass_18 = key_states_37[(Ellipsis, slice(96, None, None))] + key_states_37 = None + mul_167 = q_rot_18 * cos_21 + x1_36 = q_rot_18[(Ellipsis, slice(None, 48, None))] + x2_36 = q_rot_18[(Ellipsis, slice(48, None, None))] + q_rot_18 = None + neg_36 = -x2_36 + x2_36 = None + cat_73 = torch.cat((neg_36, x1_36), dim=-1) + neg_36 = x1_36 = None + mul_168 = 
cat_73 * sin_21 + cat_73 = None + add_109 = mul_167 + mul_168 + mul_167 = mul_168 = None + q_embed_18 = torch.cat([add_109, q_pass_18], dim=-1) + add_109 = q_pass_18 = None + mul_169 = k_rot_18 * cos_21 + cos_21 = None + x1_37 = k_rot_18[(Ellipsis, slice(None, 48, None))] + x2_37 = k_rot_18[(Ellipsis, slice(48, None, None))] + k_rot_18 = None + neg_37 = -x2_37 + x2_37 = None + cat_75 = torch.cat((neg_37, x1_37), dim=-1) + neg_37 = x1_37 = None + mul_170 = cat_75 * sin_21 + cat_75 = sin_21 = None + add_110 = mul_169 + mul_170 + mul_169 = mul_170 = None + k_embed_18 = torch.cat([add_110, k_pass_18], dim=-1) + add_110 = k_pass_18 = None + attention_mask_19 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_18 = q_embed_18.contiguous() + q_embed_18 = None + key_18 = k_embed_18.contiguous() + value_18 = value_states_37.contiguous() + attn_output_72 = torch._C._nn.scaled_dot_product_attention( + query_18, + key_18, + value_18, + attn_mask=attention_mask_19, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_18 = key_18 = value_18 = attention_mask_19 = None + transpose_76 = attn_output_72.transpose(1, 2) + attn_output_72 = None + attn_output_73 = transpose_76.contiguous() + transpose_76 = None + reshape_18 = attn_output_73.reshape(1, 2, -1) + attn_output_73 = None + attn_output_74 = reshape_18.contiguous() + reshape_18 = None + attn_output_75 = torch._C._nn.linear( + attn_output_74, + l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_74 = l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_36 = torch.nn.functional.dropout(attn_output_75, 0.0, False, False) + attn_output_75 = None + hidden_states_165 = hidden_states_161 + dropout_36 + hidden_states_161 = dropout_36 = None + hidden_states_166 = hidden_states_165.to(torch.float32) + pow_38 = hidden_states_166.pow(2) + variance_37 = pow_38.mean(-1, keepdim=True) + pow_38 = None + add_112 = variance_37 + 1e-05 + variance_37 = None + rsqrt_37 = torch.rsqrt(add_112) + add_112 = None + hidden_states_167 = hidden_states_166 * rsqrt_37 + hidden_states_166 = rsqrt_37 = None + to_79 = hidden_states_167.to(torch.bfloat16) + hidden_states_167 = None + hidden_states_168 = ( + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + * to_79 + ) + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = ( + to_79 + ) = None + up_states_54 = torch._C._nn.linear( + hidden_states_168, + l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_168 = l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_18 = up_states_54.chunk(2, dim=-1) + up_states_54 = None + gate_18 = chunk_18[0] + up_states_55 = chunk_18[1] + chunk_18 = None + silu_18 = torch.nn.functional.silu(gate_18, inplace=False) + gate_18 = None + up_states_56 = up_states_55 * silu_18 + up_states_55 = silu_18 = None + hidden_states_169 = torch._C._nn.linear( + up_states_56, + l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_56 = l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_37 = torch.nn.functional.dropout(hidden_states_169, 0.0, False, False) + hidden_states_169 = None + hidden_states_170 = hidden_states_165 
+ dropout_37 + hidden_states_165 = dropout_37 = None + hidden_states_171 = hidden_states_170.to(torch.float32) + pow_39 = hidden_states_171.pow(2) + variance_38 = pow_39.mean(-1, keepdim=True) + pow_39 = None + add_114 = variance_38 + 1e-05 + variance_38 = None + rsqrt_38 = torch.rsqrt(add_114) + add_114 = None + hidden_states_172 = hidden_states_171 * rsqrt_38 + hidden_states_171 = rsqrt_38 = None + to_81 = hidden_states_172.to(torch.bfloat16) + hidden_states_172 = None + hidden_states_173 = ( + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + * to_81 + ) + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + to_81 + ) = None + qkv_19 = torch._C._nn.linear( + hidden_states_173, + l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_173 = l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_38 = qkv_19[(Ellipsis, slice(None, 3072, None))] + key_states_38 = qkv_19[(Ellipsis, slice(3072, 6144, None))] + value_states_38 = qkv_19[(Ellipsis, slice(6144, None, None))] + qkv_19 = None + view_58 = query_states_38.view((1, 2, -1, 96)) + query_states_38 = None + query_states_39 = view_58.transpose(1, 2) + view_58 = None + view_59 = key_states_38.view((1, 2, -1, 96)) + key_states_38 = None + key_states_39 = view_59.transpose(1, 2) + view_59 = None + view_60 = value_states_38.view((1, 2, -1, 96)) + value_states_38 = None + value_states_39 = view_60.transpose(1, 2) + view_60 = None + cos_22 = cos_2.unsqueeze(1) + sin_22 = sin_2.unsqueeze(1) + q_rot_19 = query_states_39[(Ellipsis, slice(None, 96, None))] + q_pass_19 = query_states_39[(Ellipsis, slice(96, None, None))] + query_states_39 = None + k_rot_19 = key_states_39[(Ellipsis, slice(None, 96, None))] + k_pass_19 = key_states_39[(Ellipsis, slice(96, None, None))] + key_states_39 = None + mul_176 = q_rot_19 * cos_22 + x1_38 = q_rot_19[(Ellipsis, slice(None, 48, None))] + x2_38 = q_rot_19[(Ellipsis, slice(48, None, None))] + q_rot_19 = None + neg_38 = -x2_38 + x2_38 = None + cat_77 = torch.cat((neg_38, x1_38), dim=-1) + neg_38 = x1_38 = None + mul_177 = cat_77 * sin_22 + cat_77 = None + add_115 = mul_176 + mul_177 + mul_176 = mul_177 = None + q_embed_19 = torch.cat([add_115, q_pass_19], dim=-1) + add_115 = q_pass_19 = None + mul_178 = k_rot_19 * cos_22 + cos_22 = None + x1_39 = k_rot_19[(Ellipsis, slice(None, 48, None))] + x2_39 = k_rot_19[(Ellipsis, slice(48, None, None))] + k_rot_19 = None + neg_39 = -x2_39 + x2_39 = None + cat_79 = torch.cat((neg_39, x1_39), dim=-1) + neg_39 = x1_39 = None + mul_179 = cat_79 * sin_22 + cat_79 = sin_22 = None + add_116 = mul_178 + mul_179 + mul_178 = mul_179 = None + k_embed_19 = torch.cat([add_116, k_pass_19], dim=-1) + add_116 = k_pass_19 = None + attention_mask_20 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_19 = q_embed_19.contiguous() + q_embed_19 = None + key_19 = k_embed_19.contiguous() + value_19 = value_states_39.contiguous() + attn_output_76 = torch._C._nn.scaled_dot_product_attention( + query_19, + key_19, + value_19, + attn_mask=attention_mask_20, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_19 = key_19 = value_19 = attention_mask_20 = None + transpose_80 = attn_output_76.transpose(1, 2) + attn_output_76 = None + attn_output_77 = transpose_80.contiguous() + transpose_80 = None + reshape_19 = 
attn_output_77.reshape(1, 2, -1) + attn_output_77 = None + attn_output_78 = reshape_19.contiguous() + reshape_19 = None + attn_output_79 = torch._C._nn.linear( + attn_output_78, + l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_78 = l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_38 = torch.nn.functional.dropout(attn_output_79, 0.0, False, False) + attn_output_79 = None + hidden_states_174 = hidden_states_170 + dropout_38 + hidden_states_170 = dropout_38 = None + hidden_states_175 = hidden_states_174.to(torch.float32) + pow_40 = hidden_states_175.pow(2) + variance_39 = pow_40.mean(-1, keepdim=True) + pow_40 = None + add_118 = variance_39 + 1e-05 + variance_39 = None + rsqrt_39 = torch.rsqrt(add_118) + add_118 = None + hidden_states_176 = hidden_states_175 * rsqrt_39 + hidden_states_175 = rsqrt_39 = None + to_83 = hidden_states_176.to(torch.bfloat16) + hidden_states_176 = None + hidden_states_177 = ( + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + * to_83 + ) + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = ( + to_83 + ) = None + up_states_57 = torch._C._nn.linear( + hidden_states_177, + l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_177 = l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_19 = up_states_57.chunk(2, dim=-1) + up_states_57 = None + gate_19 = chunk_19[0] + up_states_58 = chunk_19[1] + chunk_19 = None + silu_19 = torch.nn.functional.silu(gate_19, inplace=False) + gate_19 = None + up_states_59 = up_states_58 * silu_19 + up_states_58 = silu_19 = None + hidden_states_178 = torch._C._nn.linear( + up_states_59, + l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_59 = l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_39 = torch.nn.functional.dropout(hidden_states_178, 0.0, False, False) + hidden_states_178 = None + hidden_states_179 = hidden_states_174 + dropout_39 + hidden_states_174 = dropout_39 = None + hidden_states_180 = hidden_states_179.to(torch.float32) + pow_41 = hidden_states_180.pow(2) + variance_40 = pow_41.mean(-1, keepdim=True) + pow_41 = None + add_120 = variance_40 + 1e-05 + variance_40 = None + rsqrt_40 = torch.rsqrt(add_120) + add_120 = None + hidden_states_181 = hidden_states_180 * rsqrt_40 + hidden_states_180 = rsqrt_40 = None + to_85 = hidden_states_181.to(torch.bfloat16) + hidden_states_181 = None + hidden_states_182 = ( + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + * to_85 + ) + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + to_85 + ) = None + qkv_20 = torch._C._nn.linear( + hidden_states_182, + l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_182 = l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_40 = qkv_20[(Ellipsis, slice(None, 3072, None))] + key_states_40 = qkv_20[(Ellipsis, slice(3072, 6144, None))] + value_states_40 = qkv_20[(Ellipsis, slice(6144, None, None))] + qkv_20 = None + view_61 = query_states_40.view((1, 2, -1, 96)) + query_states_40 = None + query_states_41 = view_61.transpose(1, 2) + view_61 = None + view_62 = 
key_states_40.view((1, 2, -1, 96)) + key_states_40 = None + key_states_41 = view_62.transpose(1, 2) + view_62 = None + view_63 = value_states_40.view((1, 2, -1, 96)) + value_states_40 = None + value_states_41 = view_63.transpose(1, 2) + view_63 = None + cos_23 = cos_2.unsqueeze(1) + sin_23 = sin_2.unsqueeze(1) + q_rot_20 = query_states_41[(Ellipsis, slice(None, 96, None))] + q_pass_20 = query_states_41[(Ellipsis, slice(96, None, None))] + query_states_41 = None + k_rot_20 = key_states_41[(Ellipsis, slice(None, 96, None))] + k_pass_20 = key_states_41[(Ellipsis, slice(96, None, None))] + key_states_41 = None + mul_185 = q_rot_20 * cos_23 + x1_40 = q_rot_20[(Ellipsis, slice(None, 48, None))] + x2_40 = q_rot_20[(Ellipsis, slice(48, None, None))] + q_rot_20 = None + neg_40 = -x2_40 + x2_40 = None + cat_81 = torch.cat((neg_40, x1_40), dim=-1) + neg_40 = x1_40 = None + mul_186 = cat_81 * sin_23 + cat_81 = None + add_121 = mul_185 + mul_186 + mul_185 = mul_186 = None + q_embed_20 = torch.cat([add_121, q_pass_20], dim=-1) + add_121 = q_pass_20 = None + mul_187 = k_rot_20 * cos_23 + cos_23 = None + x1_41 = k_rot_20[(Ellipsis, slice(None, 48, None))] + x2_41 = k_rot_20[(Ellipsis, slice(48, None, None))] + k_rot_20 = None + neg_41 = -x2_41 + x2_41 = None + cat_83 = torch.cat((neg_41, x1_41), dim=-1) + neg_41 = x1_41 = None + mul_188 = cat_83 * sin_23 + cat_83 = sin_23 = None + add_122 = mul_187 + mul_188 + mul_187 = mul_188 = None + k_embed_20 = torch.cat([add_122, k_pass_20], dim=-1) + add_122 = k_pass_20 = None + attention_mask_21 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_20 = q_embed_20.contiguous() + q_embed_20 = None + key_20 = k_embed_20.contiguous() + value_20 = value_states_41.contiguous() + attn_output_80 = torch._C._nn.scaled_dot_product_attention( + query_20, + key_20, + value_20, + attn_mask=attention_mask_21, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_20 = key_20 = value_20 = attention_mask_21 = None + transpose_84 = attn_output_80.transpose(1, 2) + attn_output_80 = None + attn_output_81 = transpose_84.contiguous() + transpose_84 = None + reshape_20 = attn_output_81.reshape(1, 2, -1) + attn_output_81 = None + attn_output_82 = reshape_20.contiguous() + reshape_20 = None + attn_output_83 = torch._C._nn.linear( + attn_output_82, + l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_82 = l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_40 = torch.nn.functional.dropout(attn_output_83, 0.0, False, False) + attn_output_83 = None + hidden_states_183 = hidden_states_179 + dropout_40 + hidden_states_179 = dropout_40 = None + hidden_states_184 = hidden_states_183.to(torch.float32) + pow_42 = hidden_states_184.pow(2) + variance_41 = pow_42.mean(-1, keepdim=True) + pow_42 = None + add_124 = variance_41 + 1e-05 + variance_41 = None + rsqrt_41 = torch.rsqrt(add_124) + add_124 = None + hidden_states_185 = hidden_states_184 * rsqrt_41 + hidden_states_184 = rsqrt_41 = None + to_87 = hidden_states_185.to(torch.bfloat16) + hidden_states_185 = None + hidden_states_186 = ( + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + * to_87 + ) + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = ( + to_87 + ) = None + up_states_60 = torch._C._nn.linear( + hidden_states_186, + 
l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_186 = l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_20 = up_states_60.chunk(2, dim=-1) + up_states_60 = None + gate_20 = chunk_20[0] + up_states_61 = chunk_20[1] + chunk_20 = None + silu_20 = torch.nn.functional.silu(gate_20, inplace=False) + gate_20 = None + up_states_62 = up_states_61 * silu_20 + up_states_61 = silu_20 = None + hidden_states_187 = torch._C._nn.linear( + up_states_62, + l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_62 = l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_41 = torch.nn.functional.dropout(hidden_states_187, 0.0, False, False) + hidden_states_187 = None + hidden_states_188 = hidden_states_183 + dropout_41 + hidden_states_183 = dropout_41 = None + hidden_states_189 = hidden_states_188.to(torch.float32) + pow_43 = hidden_states_189.pow(2) + variance_42 = pow_43.mean(-1, keepdim=True) + pow_43 = None + add_126 = variance_42 + 1e-05 + variance_42 = None + rsqrt_42 = torch.rsqrt(add_126) + add_126 = None + hidden_states_190 = hidden_states_189 * rsqrt_42 + hidden_states_189 = rsqrt_42 = None + to_89 = hidden_states_190.to(torch.bfloat16) + hidden_states_190 = None + hidden_states_191 = ( + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + * to_89 + ) + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + to_89 + ) = None + qkv_21 = torch._C._nn.linear( + hidden_states_191, + l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_191 = l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_42 = qkv_21[(Ellipsis, slice(None, 3072, None))] + key_states_42 = qkv_21[(Ellipsis, slice(3072, 6144, None))] + value_states_42 = qkv_21[(Ellipsis, slice(6144, None, None))] + qkv_21 = None + view_64 = query_states_42.view((1, 2, -1, 96)) + query_states_42 = None + query_states_43 = view_64.transpose(1, 2) + view_64 = None + view_65 = key_states_42.view((1, 2, -1, 96)) + key_states_42 = None + key_states_43 = view_65.transpose(1, 2) + view_65 = None + view_66 = value_states_42.view((1, 2, -1, 96)) + value_states_42 = None + value_states_43 = view_66.transpose(1, 2) + view_66 = None + cos_24 = cos_2.unsqueeze(1) + sin_24 = sin_2.unsqueeze(1) + q_rot_21 = query_states_43[(Ellipsis, slice(None, 96, None))] + q_pass_21 = query_states_43[(Ellipsis, slice(96, None, None))] + query_states_43 = None + k_rot_21 = key_states_43[(Ellipsis, slice(None, 96, None))] + k_pass_21 = key_states_43[(Ellipsis, slice(96, None, None))] + key_states_43 = None + mul_194 = q_rot_21 * cos_24 + x1_42 = q_rot_21[(Ellipsis, slice(None, 48, None))] + x2_42 = q_rot_21[(Ellipsis, slice(48, None, None))] + q_rot_21 = None + neg_42 = -x2_42 + x2_42 = None + cat_85 = torch.cat((neg_42, x1_42), dim=-1) + neg_42 = x1_42 = None + mul_195 = cat_85 * sin_24 + cat_85 = None + add_127 = mul_194 + mul_195 + mul_194 = mul_195 = None + q_embed_21 = torch.cat([add_127, q_pass_21], dim=-1) + add_127 = q_pass_21 = None + mul_196 = k_rot_21 * cos_24 + cos_24 = None + x1_43 = k_rot_21[(Ellipsis, slice(None, 48, None))] + x2_43 = k_rot_21[(Ellipsis, slice(48, None, None))] + k_rot_21 = None + neg_43 = -x2_43 + x2_43 = None + cat_87 = torch.cat((neg_43, x1_43), dim=-1) + neg_43 = 
x1_43 = None + mul_197 = cat_87 * sin_24 + cat_87 = sin_24 = None + add_128 = mul_196 + mul_197 + mul_196 = mul_197 = None + k_embed_21 = torch.cat([add_128, k_pass_21], dim=-1) + add_128 = k_pass_21 = None + attention_mask_22 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_21 = q_embed_21.contiguous() + q_embed_21 = None + key_21 = k_embed_21.contiguous() + value_21 = value_states_43.contiguous() + attn_output_84 = torch._C._nn.scaled_dot_product_attention( + query_21, + key_21, + value_21, + attn_mask=attention_mask_22, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_21 = key_21 = value_21 = attention_mask_22 = None + transpose_88 = attn_output_84.transpose(1, 2) + attn_output_84 = None + attn_output_85 = transpose_88.contiguous() + transpose_88 = None + reshape_21 = attn_output_85.reshape(1, 2, -1) + attn_output_85 = None + attn_output_86 = reshape_21.contiguous() + reshape_21 = None + attn_output_87 = torch._C._nn.linear( + attn_output_86, + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_86 = l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_42 = torch.nn.functional.dropout(attn_output_87, 0.0, False, False) + attn_output_87 = None + hidden_states_192 = hidden_states_188 + dropout_42 + hidden_states_188 = dropout_42 = None + hidden_states_193 = hidden_states_192.to(torch.float32) + pow_44 = hidden_states_193.pow(2) + variance_43 = pow_44.mean(-1, keepdim=True) + pow_44 = None + add_130 = variance_43 + 1e-05 + variance_43 = None + rsqrt_43 = torch.rsqrt(add_130) + add_130 = None + hidden_states_194 = hidden_states_193 * rsqrt_43 + hidden_states_193 = rsqrt_43 = None + to_91 = hidden_states_194.to(torch.bfloat16) + hidden_states_194 = None + hidden_states_195 = ( + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + * to_91 + ) + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = ( + to_91 + ) = None + up_states_63 = torch._C._nn.linear( + hidden_states_195, + l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_195 = l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_21 = up_states_63.chunk(2, dim=-1) + up_states_63 = None + gate_21 = chunk_21[0] + up_states_64 = chunk_21[1] + chunk_21 = None + silu_21 = torch.nn.functional.silu(gate_21, inplace=False) + gate_21 = None + up_states_65 = up_states_64 * silu_21 + up_states_64 = silu_21 = None + hidden_states_196 = torch._C._nn.linear( + up_states_65, + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_65 = l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_43 = torch.nn.functional.dropout(hidden_states_196, 0.0, False, False) + hidden_states_196 = None + hidden_states_197 = hidden_states_192 + dropout_43 + hidden_states_192 = dropout_43 = None + hidden_states_198 = hidden_states_197.to(torch.float32) + pow_45 = hidden_states_198.pow(2) + variance_44 = pow_45.mean(-1, keepdim=True) + pow_45 = None + add_132 = variance_44 + 1e-05 + variance_44 = None + rsqrt_44 = torch.rsqrt(add_132) + add_132 = None + hidden_states_199 = hidden_states_198 * rsqrt_44 + hidden_states_198 = rsqrt_44 = None + to_93 = 
hidden_states_199.to(torch.bfloat16) + hidden_states_199 = None + hidden_states_200 = ( + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + * to_93 + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + to_93 + ) = None + qkv_22 = torch._C._nn.linear( + hidden_states_200, + l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_200 = l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_44 = qkv_22[(Ellipsis, slice(None, 3072, None))] + key_states_44 = qkv_22[(Ellipsis, slice(3072, 6144, None))] + value_states_44 = qkv_22[(Ellipsis, slice(6144, None, None))] + qkv_22 = None + view_67 = query_states_44.view((1, 2, -1, 96)) + query_states_44 = None + query_states_45 = view_67.transpose(1, 2) + view_67 = None + view_68 = key_states_44.view((1, 2, -1, 96)) + key_states_44 = None + key_states_45 = view_68.transpose(1, 2) + view_68 = None + view_69 = value_states_44.view((1, 2, -1, 96)) + value_states_44 = None + value_states_45 = view_69.transpose(1, 2) + view_69 = None + cos_25 = cos_2.unsqueeze(1) + sin_25 = sin_2.unsqueeze(1) + q_rot_22 = query_states_45[(Ellipsis, slice(None, 96, None))] + q_pass_22 = query_states_45[(Ellipsis, slice(96, None, None))] + query_states_45 = None + k_rot_22 = key_states_45[(Ellipsis, slice(None, 96, None))] + k_pass_22 = key_states_45[(Ellipsis, slice(96, None, None))] + key_states_45 = None + mul_203 = q_rot_22 * cos_25 + x1_44 = q_rot_22[(Ellipsis, slice(None, 48, None))] + x2_44 = q_rot_22[(Ellipsis, slice(48, None, None))] + q_rot_22 = None + neg_44 = -x2_44 + x2_44 = None + cat_89 = torch.cat((neg_44, x1_44), dim=-1) + neg_44 = x1_44 = None + mul_204 = cat_89 * sin_25 + cat_89 = None + add_133 = mul_203 + mul_204 + mul_203 = mul_204 = None + q_embed_22 = torch.cat([add_133, q_pass_22], dim=-1) + add_133 = q_pass_22 = None + mul_205 = k_rot_22 * cos_25 + cos_25 = None + x1_45 = k_rot_22[(Ellipsis, slice(None, 48, None))] + x2_45 = k_rot_22[(Ellipsis, slice(48, None, None))] + k_rot_22 = None + neg_45 = -x2_45 + x2_45 = None + cat_91 = torch.cat((neg_45, x1_45), dim=-1) + neg_45 = x1_45 = None + mul_206 = cat_91 * sin_25 + cat_91 = sin_25 = None + add_134 = mul_205 + mul_206 + mul_205 = mul_206 = None + k_embed_22 = torch.cat([add_134, k_pass_22], dim=-1) + add_134 = k_pass_22 = None + attention_mask_23 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_22 = q_embed_22.contiguous() + q_embed_22 = None + key_22 = k_embed_22.contiguous() + value_22 = value_states_45.contiguous() + attn_output_88 = torch._C._nn.scaled_dot_product_attention( + query_22, + key_22, + value_22, + attn_mask=attention_mask_23, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_22 = key_22 = value_22 = attention_mask_23 = None + transpose_92 = attn_output_88.transpose(1, 2) + attn_output_88 = None + attn_output_89 = transpose_92.contiguous() + transpose_92 = None + reshape_22 = attn_output_89.reshape(1, 2, -1) + attn_output_89 = None + attn_output_90 = reshape_22.contiguous() + reshape_22 = None + attn_output_91 = torch._C._nn.linear( + attn_output_90, + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_90 = l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_44 = 
torch.nn.functional.dropout(attn_output_91, 0.0, False, False) + attn_output_91 = None + hidden_states_201 = hidden_states_197 + dropout_44 + hidden_states_197 = dropout_44 = None + hidden_states_202 = hidden_states_201.to(torch.float32) + pow_46 = hidden_states_202.pow(2) + variance_45 = pow_46.mean(-1, keepdim=True) + pow_46 = None + add_136 = variance_45 + 1e-05 + variance_45 = None + rsqrt_45 = torch.rsqrt(add_136) + add_136 = None + hidden_states_203 = hidden_states_202 * rsqrt_45 + hidden_states_202 = rsqrt_45 = None + to_95 = hidden_states_203.to(torch.bfloat16) + hidden_states_203 = None + hidden_states_204 = ( + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + * to_95 + ) + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = ( + to_95 + ) = None + up_states_66 = torch._C._nn.linear( + hidden_states_204, + l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_204 = l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_22 = up_states_66.chunk(2, dim=-1) + up_states_66 = None + gate_22 = chunk_22[0] + up_states_67 = chunk_22[1] + chunk_22 = None + silu_22 = torch.nn.functional.silu(gate_22, inplace=False) + gate_22 = None + up_states_68 = up_states_67 * silu_22 + up_states_67 = silu_22 = None + hidden_states_205 = torch._C._nn.linear( + up_states_68, + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_68 = l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_45 = torch.nn.functional.dropout(hidden_states_205, 0.0, False, False) + hidden_states_205 = None + hidden_states_206 = hidden_states_201 + dropout_45 + hidden_states_201 = dropout_45 = None + hidden_states_207 = hidden_states_206.to(torch.float32) + pow_47 = hidden_states_207.pow(2) + variance_46 = pow_47.mean(-1, keepdim=True) + pow_47 = None + add_138 = variance_46 + 1e-05 + variance_46 = None + rsqrt_46 = torch.rsqrt(add_138) + add_138 = None + hidden_states_208 = hidden_states_207 * rsqrt_46 + hidden_states_207 = rsqrt_46 = None + to_97 = hidden_states_208.to(torch.bfloat16) + hidden_states_208 = None + hidden_states_209 = ( + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + * to_97 + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + to_97 + ) = None + qkv_23 = torch._C._nn.linear( + hidden_states_209, + l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_209 = l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_46 = qkv_23[(Ellipsis, slice(None, 3072, None))] + key_states_46 = qkv_23[(Ellipsis, slice(3072, 6144, None))] + value_states_46 = qkv_23[(Ellipsis, slice(6144, None, None))] + qkv_23 = None + view_70 = query_states_46.view((1, 2, -1, 96)) + query_states_46 = None + query_states_47 = view_70.transpose(1, 2) + view_70 = None + view_71 = key_states_46.view((1, 2, -1, 96)) + key_states_46 = None + key_states_47 = view_71.transpose(1, 2) + view_71 = None + view_72 = value_states_46.view((1, 2, -1, 96)) + value_states_46 = None + value_states_47 = view_72.transpose(1, 2) + view_72 = None + cos_26 = cos_2.unsqueeze(1) + sin_26 = sin_2.unsqueeze(1) + q_rot_23 = query_states_47[(Ellipsis, slice(None, 96, None))] + q_pass_23 = query_states_47[(Ellipsis, 
slice(96, None, None))] + query_states_47 = None + k_rot_23 = key_states_47[(Ellipsis, slice(None, 96, None))] + k_pass_23 = key_states_47[(Ellipsis, slice(96, None, None))] + key_states_47 = None + mul_212 = q_rot_23 * cos_26 + x1_46 = q_rot_23[(Ellipsis, slice(None, 48, None))] + x2_46 = q_rot_23[(Ellipsis, slice(48, None, None))] + q_rot_23 = None + neg_46 = -x2_46 + x2_46 = None + cat_93 = torch.cat((neg_46, x1_46), dim=-1) + neg_46 = x1_46 = None + mul_213 = cat_93 * sin_26 + cat_93 = None + add_139 = mul_212 + mul_213 + mul_212 = mul_213 = None + q_embed_23 = torch.cat([add_139, q_pass_23], dim=-1) + add_139 = q_pass_23 = None + mul_214 = k_rot_23 * cos_26 + cos_26 = None + x1_47 = k_rot_23[(Ellipsis, slice(None, 48, None))] + x2_47 = k_rot_23[(Ellipsis, slice(48, None, None))] + k_rot_23 = None + neg_47 = -x2_47 + x2_47 = None + cat_95 = torch.cat((neg_47, x1_47), dim=-1) + neg_47 = x1_47 = None + mul_215 = cat_95 * sin_26 + cat_95 = sin_26 = None + add_140 = mul_214 + mul_215 + mul_214 = mul_215 = None + k_embed_23 = torch.cat([add_140, k_pass_23], dim=-1) + add_140 = k_pass_23 = None + attention_mask_24 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_23 = q_embed_23.contiguous() + q_embed_23 = None + key_23 = k_embed_23.contiguous() + value_23 = value_states_47.contiguous() + attn_output_92 = torch._C._nn.scaled_dot_product_attention( + query_23, + key_23, + value_23, + attn_mask=attention_mask_24, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_23 = key_23 = value_23 = attention_mask_24 = None + transpose_96 = attn_output_92.transpose(1, 2) + attn_output_92 = None + attn_output_93 = transpose_96.contiguous() + transpose_96 = None + reshape_23 = attn_output_93.reshape(1, 2, -1) + attn_output_93 = None + attn_output_94 = reshape_23.contiguous() + reshape_23 = None + attn_output_95 = torch._C._nn.linear( + attn_output_94, + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_94 = l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_46 = torch.nn.functional.dropout(attn_output_95, 0.0, False, False) + attn_output_95 = None + hidden_states_210 = hidden_states_206 + dropout_46 + hidden_states_206 = dropout_46 = None + hidden_states_211 = hidden_states_210.to(torch.float32) + pow_48 = hidden_states_211.pow(2) + variance_47 = pow_48.mean(-1, keepdim=True) + pow_48 = None + add_142 = variance_47 + 1e-05 + variance_47 = None + rsqrt_47 = torch.rsqrt(add_142) + add_142 = None + hidden_states_212 = hidden_states_211 * rsqrt_47 + hidden_states_211 = rsqrt_47 = None + to_99 = hidden_states_212.to(torch.bfloat16) + hidden_states_212 = None + hidden_states_213 = ( + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + * to_99 + ) + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = ( + to_99 + ) = None + up_states_69 = torch._C._nn.linear( + hidden_states_213, + l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_213 = l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_23 = up_states_69.chunk(2, dim=-1) + up_states_69 = None + gate_23 = chunk_23[0] + up_states_70 = chunk_23[1] + chunk_23 = None + silu_23 = torch.nn.functional.silu(gate_23, inplace=False) + gate_23 = None + up_states_71 
= up_states_70 * silu_23 + up_states_70 = silu_23 = None + hidden_states_214 = torch._C._nn.linear( + up_states_71, + l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_71 = l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_47 = torch.nn.functional.dropout(hidden_states_214, 0.0, False, False) + hidden_states_214 = None + hidden_states_215 = hidden_states_210 + dropout_47 + hidden_states_210 = dropout_47 = None + hidden_states_216 = hidden_states_215.to(torch.float32) + pow_49 = hidden_states_216.pow(2) + variance_48 = pow_49.mean(-1, keepdim=True) + pow_49 = None + add_144 = variance_48 + 1e-05 + variance_48 = None + rsqrt_48 = torch.rsqrt(add_144) + add_144 = None + hidden_states_217 = hidden_states_216 * rsqrt_48 + hidden_states_216 = rsqrt_48 = None + to_101 = hidden_states_217.to(torch.bfloat16) + hidden_states_217 = None + hidden_states_218 = ( + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ + * to_101 + ) + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = ( + to_101 + ) = None + qkv_24 = torch._C._nn.linear( + hidden_states_218, + l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_218 = l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_48 = qkv_24[(Ellipsis, slice(None, 3072, None))] + key_states_48 = qkv_24[(Ellipsis, slice(3072, 6144, None))] + value_states_48 = qkv_24[(Ellipsis, slice(6144, None, None))] + qkv_24 = None + view_73 = query_states_48.view((1, 2, -1, 96)) + query_states_48 = None + query_states_49 = view_73.transpose(1, 2) + view_73 = None + view_74 = key_states_48.view((1, 2, -1, 96)) + key_states_48 = None + key_states_49 = view_74.transpose(1, 2) + view_74 = None + view_75 = value_states_48.view((1, 2, -1, 96)) + value_states_48 = None + value_states_49 = view_75.transpose(1, 2) + view_75 = None + cos_27 = cos_2.unsqueeze(1) + sin_27 = sin_2.unsqueeze(1) + q_rot_24 = query_states_49[(Ellipsis, slice(None, 96, None))] + q_pass_24 = query_states_49[(Ellipsis, slice(96, None, None))] + query_states_49 = None + k_rot_24 = key_states_49[(Ellipsis, slice(None, 96, None))] + k_pass_24 = key_states_49[(Ellipsis, slice(96, None, None))] + key_states_49 = None + mul_221 = q_rot_24 * cos_27 + x1_48 = q_rot_24[(Ellipsis, slice(None, 48, None))] + x2_48 = q_rot_24[(Ellipsis, slice(48, None, None))] + q_rot_24 = None + neg_48 = -x2_48 + x2_48 = None + cat_97 = torch.cat((neg_48, x1_48), dim=-1) + neg_48 = x1_48 = None + mul_222 = cat_97 * sin_27 + cat_97 = None + add_145 = mul_221 + mul_222 + mul_221 = mul_222 = None + q_embed_24 = torch.cat([add_145, q_pass_24], dim=-1) + add_145 = q_pass_24 = None + mul_223 = k_rot_24 * cos_27 + cos_27 = None + x1_49 = k_rot_24[(Ellipsis, slice(None, 48, None))] + x2_49 = k_rot_24[(Ellipsis, slice(48, None, None))] + k_rot_24 = None + neg_49 = -x2_49 + x2_49 = None + cat_99 = torch.cat((neg_49, x1_49), dim=-1) + neg_49 = x1_49 = None + mul_224 = cat_99 * sin_27 + cat_99 = sin_27 = None + add_146 = mul_223 + mul_224 + mul_223 = mul_224 = None + k_embed_24 = torch.cat([add_146, k_pass_24], dim=-1) + add_146 = k_pass_24 = None + attention_mask_25 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_24 = q_embed_24.contiguous() + q_embed_24 = None + key_24 = 
k_embed_24.contiguous() + value_24 = value_states_49.contiguous() + attn_output_96 = torch._C._nn.scaled_dot_product_attention( + query_24, + key_24, + value_24, + attn_mask=attention_mask_25, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_24 = key_24 = value_24 = attention_mask_25 = None + transpose_100 = attn_output_96.transpose(1, 2) + attn_output_96 = None + attn_output_97 = transpose_100.contiguous() + transpose_100 = None + reshape_24 = attn_output_97.reshape(1, 2, -1) + attn_output_97 = None + attn_output_98 = reshape_24.contiguous() + reshape_24 = None + attn_output_99 = torch._C._nn.linear( + attn_output_98, + l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_98 = l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_48 = torch.nn.functional.dropout(attn_output_99, 0.0, False, False) + attn_output_99 = None + hidden_states_219 = hidden_states_215 + dropout_48 + hidden_states_215 = dropout_48 = None + hidden_states_220 = hidden_states_219.to(torch.float32) + pow_50 = hidden_states_220.pow(2) + variance_49 = pow_50.mean(-1, keepdim=True) + pow_50 = None + add_148 = variance_49 + 1e-05 + variance_49 = None + rsqrt_49 = torch.rsqrt(add_148) + add_148 = None + hidden_states_221 = hidden_states_220 * rsqrt_49 + hidden_states_220 = rsqrt_49 = None + to_103 = hidden_states_221.to(torch.bfloat16) + hidden_states_221 = None + hidden_states_222 = ( + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ + * to_103 + ) + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = ( + to_103 + ) = None + up_states_72 = torch._C._nn.linear( + hidden_states_222, + l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_222 = l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_24 = up_states_72.chunk(2, dim=-1) + up_states_72 = None + gate_24 = chunk_24[0] + up_states_73 = chunk_24[1] + chunk_24 = None + silu_24 = torch.nn.functional.silu(gate_24, inplace=False) + gate_24 = None + up_states_74 = up_states_73 * silu_24 + up_states_73 = silu_24 = None + hidden_states_223 = torch._C._nn.linear( + up_states_74, + l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_74 = l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_49 = torch.nn.functional.dropout(hidden_states_223, 0.0, False, False) + hidden_states_223 = None + hidden_states_224 = hidden_states_219 + dropout_49 + hidden_states_219 = dropout_49 = None + hidden_states_225 = hidden_states_224.to(torch.float32) + pow_51 = hidden_states_225.pow(2) + variance_50 = pow_51.mean(-1, keepdim=True) + pow_51 = None + add_150 = variance_50 + 1e-05 + variance_50 = None + rsqrt_50 = torch.rsqrt(add_150) + add_150 = None + hidden_states_226 = hidden_states_225 * rsqrt_50 + hidden_states_225 = rsqrt_50 = None + to_105 = hidden_states_226.to(torch.bfloat16) + hidden_states_226 = None + hidden_states_227 = ( + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ + * to_105 + ) + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = ( + to_105 + ) = None + qkv_25 = torch._C._nn.linear( + hidden_states_227, + l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + 
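# [Editor's note, not part of the captured graph] Every layer above repeats the
# same fused-QKV pattern: one bias-free linear produces a 9216-wide tensor that
# is sliced at offsets 3072/6144 into Q/K/V and reshaped so heads precede the
# sequence axis (3072 = 32 heads x 96 head_dim, inferred from the slices and the
# view(..., 96) calls). A minimal hand-written equivalent, with hypothetical
# names (`split_qkv`, `hidden`, `qkv_weight` are illustrative, not from the patch):
import torch

def split_qkv(hidden: torch.Tensor, qkv_weight: torch.Tensor):
    # hidden: (batch, seq, 3072); qkv_weight: (9216, 3072), no bias
    qkv = torch.nn.functional.linear(hidden, qkv_weight)
    q, k, v = qkv[..., :3072], qkv[..., 3072:6144], qkv[..., 6144:]

    def to_heads(t):
        # (batch, seq, heads * 96) -> (batch, heads, seq, 96)
        return t.view(t.shape[0], t.shape[1], -1, 96).transpose(1, 2)

    return to_heads(q), to_heads(k), to_heads(v)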
hidden_states_227 = l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_50 = qkv_25[(Ellipsis, slice(None, 3072, None))] + key_states_50 = qkv_25[(Ellipsis, slice(3072, 6144, None))] + value_states_50 = qkv_25[(Ellipsis, slice(6144, None, None))] + qkv_25 = None + view_76 = query_states_50.view((1, 2, -1, 96)) + query_states_50 = None + query_states_51 = view_76.transpose(1, 2) + view_76 = None + view_77 = key_states_50.view((1, 2, -1, 96)) + key_states_50 = None + key_states_51 = view_77.transpose(1, 2) + view_77 = None + view_78 = value_states_50.view((1, 2, -1, 96)) + value_states_50 = None + value_states_51 = view_78.transpose(1, 2) + view_78 = None + cos_28 = cos_2.unsqueeze(1) + sin_28 = sin_2.unsqueeze(1) + q_rot_25 = query_states_51[(Ellipsis, slice(None, 96, None))] + q_pass_25 = query_states_51[(Ellipsis, slice(96, None, None))] + query_states_51 = None + k_rot_25 = key_states_51[(Ellipsis, slice(None, 96, None))] + k_pass_25 = key_states_51[(Ellipsis, slice(96, None, None))] + key_states_51 = None + mul_230 = q_rot_25 * cos_28 + x1_50 = q_rot_25[(Ellipsis, slice(None, 48, None))] + x2_50 = q_rot_25[(Ellipsis, slice(48, None, None))] + q_rot_25 = None + neg_50 = -x2_50 + x2_50 = None + cat_101 = torch.cat((neg_50, x1_50), dim=-1) + neg_50 = x1_50 = None + mul_231 = cat_101 * sin_28 + cat_101 = None + add_151 = mul_230 + mul_231 + mul_230 = mul_231 = None + q_embed_25 = torch.cat([add_151, q_pass_25], dim=-1) + add_151 = q_pass_25 = None + mul_232 = k_rot_25 * cos_28 + cos_28 = None + x1_51 = k_rot_25[(Ellipsis, slice(None, 48, None))] + x2_51 = k_rot_25[(Ellipsis, slice(48, None, None))] + k_rot_25 = None + neg_51 = -x2_51 + x2_51 = None + cat_103 = torch.cat((neg_51, x1_51), dim=-1) + neg_51 = x1_51 = None + mul_233 = cat_103 * sin_28 + cat_103 = sin_28 = None + add_152 = mul_232 + mul_233 + mul_232 = mul_233 = None + k_embed_25 = torch.cat([add_152, k_pass_25], dim=-1) + add_152 = k_pass_25 = None + attention_mask_26 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_25 = q_embed_25.contiguous() + q_embed_25 = None + key_25 = k_embed_25.contiguous() + value_25 = value_states_51.contiguous() + attn_output_100 = torch._C._nn.scaled_dot_product_attention( + query_25, + key_25, + value_25, + attn_mask=attention_mask_26, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_25 = key_25 = value_25 = attention_mask_26 = None + transpose_104 = attn_output_100.transpose(1, 2) + attn_output_100 = None + attn_output_101 = transpose_104.contiguous() + transpose_104 = None + reshape_25 = attn_output_101.reshape(1, 2, -1) + attn_output_101 = None + attn_output_102 = reshape_25.contiguous() + reshape_25 = None + attn_output_103 = torch._C._nn.linear( + attn_output_102, + l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_102 = l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_50 = torch.nn.functional.dropout(attn_output_103, 0.0, False, False) + attn_output_103 = None + hidden_states_228 = hidden_states_224 + dropout_50 + hidden_states_224 = dropout_50 = None + hidden_states_229 = hidden_states_228.to(torch.float32) + pow_52 = hidden_states_229.pow(2) + variance_51 = pow_52.mean(-1, keepdim=True) + pow_52 = None + add_154 = variance_51 + 1e-05 + variance_51 = None + rsqrt_51 = torch.rsqrt(add_154) + add_154 = 
None + hidden_states_230 = hidden_states_229 * rsqrt_51 + hidden_states_229 = rsqrt_51 = None + to_107 = hidden_states_230.to(torch.bfloat16) + hidden_states_230 = None + hidden_states_231 = ( + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ + * to_107 + ) + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = ( + to_107 + ) = None + up_states_75 = torch._C._nn.linear( + hidden_states_231, + l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_231 = l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_25 = up_states_75.chunk(2, dim=-1) + up_states_75 = None + gate_25 = chunk_25[0] + up_states_76 = chunk_25[1] + chunk_25 = None + silu_25 = torch.nn.functional.silu(gate_25, inplace=False) + gate_25 = None + up_states_77 = up_states_76 * silu_25 + up_states_76 = silu_25 = None + hidden_states_232 = torch._C._nn.linear( + up_states_77, + l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_77 = l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_51 = torch.nn.functional.dropout(hidden_states_232, 0.0, False, False) + hidden_states_232 = None + hidden_states_233 = hidden_states_228 + dropout_51 + hidden_states_228 = dropout_51 = None + hidden_states_234 = hidden_states_233.to(torch.float32) + pow_53 = hidden_states_234.pow(2) + variance_52 = pow_53.mean(-1, keepdim=True) + pow_53 = None + add_156 = variance_52 + 1e-05 + variance_52 = None + rsqrt_52 = torch.rsqrt(add_156) + add_156 = None + hidden_states_235 = hidden_states_234 * rsqrt_52 + hidden_states_234 = rsqrt_52 = None + to_109 = hidden_states_235.to(torch.bfloat16) + hidden_states_235 = None + hidden_states_236 = ( + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ + * to_109 + ) + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = ( + to_109 + ) = None + qkv_26 = torch._C._nn.linear( + hidden_states_236, + l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_236 = l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_52 = qkv_26[(Ellipsis, slice(None, 3072, None))] + key_states_52 = qkv_26[(Ellipsis, slice(3072, 6144, None))] + value_states_52 = qkv_26[(Ellipsis, slice(6144, None, None))] + qkv_26 = None + view_79 = query_states_52.view((1, 2, -1, 96)) + query_states_52 = None + query_states_53 = view_79.transpose(1, 2) + view_79 = None + view_80 = key_states_52.view((1, 2, -1, 96)) + key_states_52 = None + key_states_53 = view_80.transpose(1, 2) + view_80 = None + view_81 = value_states_52.view((1, 2, -1, 96)) + value_states_52 = None + value_states_53 = view_81.transpose(1, 2) + view_81 = None + cos_29 = cos_2.unsqueeze(1) + sin_29 = sin_2.unsqueeze(1) + q_rot_26 = query_states_53[(Ellipsis, slice(None, 96, None))] + q_pass_26 = query_states_53[(Ellipsis, slice(96, None, None))] + query_states_53 = None + k_rot_26 = key_states_53[(Ellipsis, slice(None, 96, None))] + k_pass_26 = key_states_53[(Ellipsis, slice(96, None, None))] + key_states_53 = None + mul_239 = q_rot_26 * cos_29 + x1_52 = q_rot_26[(Ellipsis, slice(None, 48, None))] + x2_52 = q_rot_26[(Ellipsis, slice(48, None, None))] + q_rot_26 = None + neg_52 = -x2_52 + x2_52 = None + cat_105 = torch.cat((neg_52, x1_52), dim=-1) + 
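# [Editor's note, not part of the captured graph] The neg/cat/mul/add sequence
# around this point is the traced form of rotate_half rotary embeddings: the
# first 96 channels of each head (q_rot/k_rot) are rotated by cos/sin while the
# remainder (q_pass/k_pass) is passed through and re-concatenated. Since the
# head dim here is also 96, the pass-through slice appears to be empty; the
# generic partial-rotary code path was simply traced as-is. Hedged sketch with
# hypothetical helper names:
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    half = x.shape[-1] // 2
    x1, x2 = x[..., :half], x[..., half:]
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin):
    # cos/sin come in as (batch, 1, seq, 96) via unsqueeze(1), broadcasting
    # across the head axis of q/k: (batch, heads, seq, 96)
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin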
neg_52 = x1_52 = None + mul_240 = cat_105 * sin_29 + cat_105 = None + add_157 = mul_239 + mul_240 + mul_239 = mul_240 = None + q_embed_26 = torch.cat([add_157, q_pass_26], dim=-1) + add_157 = q_pass_26 = None + mul_241 = k_rot_26 * cos_29 + cos_29 = None + x1_53 = k_rot_26[(Ellipsis, slice(None, 48, None))] + x2_53 = k_rot_26[(Ellipsis, slice(48, None, None))] + k_rot_26 = None + neg_53 = -x2_53 + x2_53 = None + cat_107 = torch.cat((neg_53, x1_53), dim=-1) + neg_53 = x1_53 = None + mul_242 = cat_107 * sin_29 + cat_107 = sin_29 = None + add_158 = mul_241 + mul_242 + mul_241 = mul_242 = None + k_embed_26 = torch.cat([add_158, k_pass_26], dim=-1) + add_158 = k_pass_26 = None + attention_mask_27 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_26 = q_embed_26.contiguous() + q_embed_26 = None + key_26 = k_embed_26.contiguous() + value_26 = value_states_53.contiguous() + attn_output_104 = torch._C._nn.scaled_dot_product_attention( + query_26, + key_26, + value_26, + attn_mask=attention_mask_27, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_26 = key_26 = value_26 = attention_mask_27 = None + transpose_108 = attn_output_104.transpose(1, 2) + attn_output_104 = None + attn_output_105 = transpose_108.contiguous() + transpose_108 = None + reshape_26 = attn_output_105.reshape(1, 2, -1) + attn_output_105 = None + attn_output_106 = reshape_26.contiguous() + reshape_26 = None + attn_output_107 = torch._C._nn.linear( + attn_output_106, + l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_106 = l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_52 = torch.nn.functional.dropout(attn_output_107, 0.0, False, False) + attn_output_107 = None + hidden_states_237 = hidden_states_233 + dropout_52 + hidden_states_233 = dropout_52 = None + hidden_states_238 = hidden_states_237.to(torch.float32) + pow_54 = hidden_states_238.pow(2) + variance_53 = pow_54.mean(-1, keepdim=True) + pow_54 = None + add_160 = variance_53 + 1e-05 + variance_53 = None + rsqrt_53 = torch.rsqrt(add_160) + add_160 = None + hidden_states_239 = hidden_states_238 * rsqrt_53 + hidden_states_238 = rsqrt_53 = None + to_111 = hidden_states_239.to(torch.bfloat16) + hidden_states_239 = None + hidden_states_240 = ( + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ + * to_111 + ) + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = ( + to_111 + ) = None + up_states_78 = torch._C._nn.linear( + hidden_states_240, + l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_240 = l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_26 = up_states_78.chunk(2, dim=-1) + up_states_78 = None + gate_26 = chunk_26[0] + up_states_79 = chunk_26[1] + chunk_26 = None + silu_26 = torch.nn.functional.silu(gate_26, inplace=False) + gate_26 = None + up_states_80 = up_states_79 * silu_26 + up_states_79 = silu_26 = None + hidden_states_241 = torch._C._nn.linear( + up_states_80, + l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_80 = l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_53 = torch.nn.functional.dropout(hidden_states_241, 0.0, False, False) + 
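# [Editor's note, not part of the captured graph] The chunk/silu/mul sequence
# just above is a gated (SwiGLU-style) MLP: one fused gate_up projection is
# chunked in two, the gate half goes through SiLU, the elementwise product is
# projected back down, and the pow(2).mean(-1)/rsqrt blocks on either side are
# the traced form of RMSNorm with eps=1e-05. Minimal equivalent, hypothetical
# names:
import torch
import torch.nn.functional as F

def gated_mlp(x, gate_up_weight, down_weight):
    # x: (batch, seq, 3072); both projections are bias-free, as in the graph
    gate, up = F.linear(x, gate_up_weight).chunk(2, dim=-1)
    return F.linear(up * F.silu(gate), down_weight)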
hidden_states_241 = None + hidden_states_242 = hidden_states_237 + dropout_53 + hidden_states_237 = dropout_53 = None + hidden_states_243 = hidden_states_242.to(torch.float32) + pow_55 = hidden_states_243.pow(2) + variance_54 = pow_55.mean(-1, keepdim=True) + pow_55 = None + add_162 = variance_54 + 1e-05 + variance_54 = None + rsqrt_54 = torch.rsqrt(add_162) + add_162 = None + hidden_states_244 = hidden_states_243 * rsqrt_54 + hidden_states_243 = rsqrt_54 = None + to_113 = hidden_states_244.to(torch.bfloat16) + hidden_states_244 = None + hidden_states_245 = ( + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ + * to_113 + ) + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = ( + to_113 + ) = None + qkv_27 = torch._C._nn.linear( + hidden_states_245, + l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_245 = l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_54 = qkv_27[(Ellipsis, slice(None, 3072, None))] + key_states_54 = qkv_27[(Ellipsis, slice(3072, 6144, None))] + value_states_54 = qkv_27[(Ellipsis, slice(6144, None, None))] + qkv_27 = None + view_82 = query_states_54.view((1, 2, -1, 96)) + query_states_54 = None + query_states_55 = view_82.transpose(1, 2) + view_82 = None + view_83 = key_states_54.view((1, 2, -1, 96)) + key_states_54 = None + key_states_55 = view_83.transpose(1, 2) + view_83 = None + view_84 = value_states_54.view((1, 2, -1, 96)) + value_states_54 = None + value_states_55 = view_84.transpose(1, 2) + view_84 = None + cos_30 = cos_2.unsqueeze(1) + sin_30 = sin_2.unsqueeze(1) + q_rot_27 = query_states_55[(Ellipsis, slice(None, 96, None))] + q_pass_27 = query_states_55[(Ellipsis, slice(96, None, None))] + query_states_55 = None + k_rot_27 = key_states_55[(Ellipsis, slice(None, 96, None))] + k_pass_27 = key_states_55[(Ellipsis, slice(96, None, None))] + key_states_55 = None + mul_248 = q_rot_27 * cos_30 + x1_54 = q_rot_27[(Ellipsis, slice(None, 48, None))] + x2_54 = q_rot_27[(Ellipsis, slice(48, None, None))] + q_rot_27 = None + neg_54 = -x2_54 + x2_54 = None + cat_109 = torch.cat((neg_54, x1_54), dim=-1) + neg_54 = x1_54 = None + mul_249 = cat_109 * sin_30 + cat_109 = None + add_163 = mul_248 + mul_249 + mul_248 = mul_249 = None + q_embed_27 = torch.cat([add_163, q_pass_27], dim=-1) + add_163 = q_pass_27 = None + mul_250 = k_rot_27 * cos_30 + cos_30 = None + x1_55 = k_rot_27[(Ellipsis, slice(None, 48, None))] + x2_55 = k_rot_27[(Ellipsis, slice(48, None, None))] + k_rot_27 = None + neg_55 = -x2_55 + x2_55 = None + cat_111 = torch.cat((neg_55, x1_55), dim=-1) + neg_55 = x1_55 = None + mul_251 = cat_111 * sin_30 + cat_111 = sin_30 = None + add_164 = mul_250 + mul_251 + mul_250 = mul_251 = None + k_embed_27 = torch.cat([add_164, k_pass_27], dim=-1) + add_164 = k_pass_27 = None + attention_mask_28 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_27 = q_embed_27.contiguous() + q_embed_27 = None + key_27 = k_embed_27.contiguous() + value_27 = value_states_55.contiguous() + attn_output_108 = torch._C._nn.scaled_dot_product_attention( + query_27, + key_27, + value_27, + attn_mask=attention_mask_28, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_27 = key_27 = value_27 = attention_mask_28 = None + transpose_112 = attn_output_108.transpose(1, 2) + attn_output_108 = None + 
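# Editor's note: torch._C._nn.scaled_dot_product_attention is the private
+ # entry point Dynamo records for F.scaled_dot_product_attention. The scale
+ # 0.10206207261596575 is 1/sqrt(head_dim) = 96 ** -0.5, and is_causal is
+ # False because causality is already baked into attn_mask (causal_mask_3).
+ # Public-API equivalent, as a sketch:
+ #   torch.nn.functional.scaled_dot_product_attention(
+ #       query, key, value, attn_mask=mask, dropout_p=0.0, scale=96 ** -0.5) +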
attn_output_109 = transpose_112.contiguous() + transpose_112 = None + reshape_27 = attn_output_109.reshape(1, 2, -1) + attn_output_109 = None + attn_output_110 = reshape_27.contiguous() + reshape_27 = None + attn_output_111 = torch._C._nn.linear( + attn_output_110, + l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_110 = l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_54 = torch.nn.functional.dropout(attn_output_111, 0.0, False, False) + attn_output_111 = None + hidden_states_246 = hidden_states_242 + dropout_54 + hidden_states_242 = dropout_54 = None + hidden_states_247 = hidden_states_246.to(torch.float32) + pow_56 = hidden_states_247.pow(2) + variance_55 = pow_56.mean(-1, keepdim=True) + pow_56 = None + add_166 = variance_55 + 1e-05 + variance_55 = None + rsqrt_55 = torch.rsqrt(add_166) + add_166 = None + hidden_states_248 = hidden_states_247 * rsqrt_55 + hidden_states_247 = rsqrt_55 = None + to_115 = hidden_states_248.to(torch.bfloat16) + hidden_states_248 = None + hidden_states_249 = ( + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ + * to_115 + ) + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = ( + to_115 + ) = None + up_states_81 = torch._C._nn.linear( + hidden_states_249, + l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_249 = l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_27 = up_states_81.chunk(2, dim=-1) + up_states_81 = None + gate_27 = chunk_27[0] + up_states_82 = chunk_27[1] + chunk_27 = None + silu_27 = torch.nn.functional.silu(gate_27, inplace=False) + gate_27 = None + up_states_83 = up_states_82 * silu_27 + up_states_82 = silu_27 = None + hidden_states_250 = torch._C._nn.linear( + up_states_83, + l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_83 = l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_55 = torch.nn.functional.dropout(hidden_states_250, 0.0, False, False) + hidden_states_250 = None + hidden_states_251 = hidden_states_246 + dropout_55 + hidden_states_246 = dropout_55 = None + hidden_states_252 = hidden_states_251.to(torch.float32) + pow_57 = hidden_states_252.pow(2) + variance_56 = pow_57.mean(-1, keepdim=True) + pow_57 = None + add_168 = variance_56 + 1e-05 + variance_56 = None + rsqrt_56 = torch.rsqrt(add_168) + add_168 = None + hidden_states_253 = hidden_states_252 * rsqrt_56 + hidden_states_252 = rsqrt_56 = None + to_117 = hidden_states_253.to(torch.bfloat16) + hidden_states_253 = None + hidden_states_254 = ( + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ + * to_117 + ) + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = ( + to_117 + ) = None + qkv_28 = torch._C._nn.linear( + hidden_states_254, + l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_254 = l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_56 = qkv_28[(Ellipsis, slice(None, 3072, None))] + key_states_56 = qkv_28[(Ellipsis, slice(3072, 6144, None))] + value_states_56 = qkv_28[(Ellipsis, slice(6144, None, None))] + qkv_28 = None + view_85 = query_states_56.view((1, 2, -1, 96)) + query_states_56 = 
None + query_states_57 = view_85.transpose(1, 2) + view_85 = None + view_86 = key_states_56.view((1, 2, -1, 96)) + key_states_56 = None + key_states_57 = view_86.transpose(1, 2) + view_86 = None + view_87 = value_states_56.view((1, 2, -1, 96)) + value_states_56 = None + value_states_57 = view_87.transpose(1, 2) + view_87 = None + cos_31 = cos_2.unsqueeze(1) + sin_31 = sin_2.unsqueeze(1) + q_rot_28 = query_states_57[(Ellipsis, slice(None, 96, None))] + q_pass_28 = query_states_57[(Ellipsis, slice(96, None, None))] + query_states_57 = None + k_rot_28 = key_states_57[(Ellipsis, slice(None, 96, None))] + k_pass_28 = key_states_57[(Ellipsis, slice(96, None, None))] + key_states_57 = None + mul_257 = q_rot_28 * cos_31 + x1_56 = q_rot_28[(Ellipsis, slice(None, 48, None))] + x2_56 = q_rot_28[(Ellipsis, slice(48, None, None))] + q_rot_28 = None + neg_56 = -x2_56 + x2_56 = None + cat_113 = torch.cat((neg_56, x1_56), dim=-1) + neg_56 = x1_56 = None + mul_258 = cat_113 * sin_31 + cat_113 = None + add_169 = mul_257 + mul_258 + mul_257 = mul_258 = None + q_embed_28 = torch.cat([add_169, q_pass_28], dim=-1) + add_169 = q_pass_28 = None + mul_259 = k_rot_28 * cos_31 + cos_31 = None + x1_57 = k_rot_28[(Ellipsis, slice(None, 48, None))] + x2_57 = k_rot_28[(Ellipsis, slice(48, None, None))] + k_rot_28 = None + neg_57 = -x2_57 + x2_57 = None + cat_115 = torch.cat((neg_57, x1_57), dim=-1) + neg_57 = x1_57 = None + mul_260 = cat_115 * sin_31 + cat_115 = sin_31 = None + add_170 = mul_259 + mul_260 + mul_259 = mul_260 = None + k_embed_28 = torch.cat([add_170, k_pass_28], dim=-1) + add_170 = k_pass_28 = None + attention_mask_29 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_28 = q_embed_28.contiguous() + q_embed_28 = None + key_28 = k_embed_28.contiguous() + value_28 = value_states_57.contiguous() + attn_output_112 = torch._C._nn.scaled_dot_product_attention( + query_28, + key_28, + value_28, + attn_mask=attention_mask_29, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_28 = key_28 = value_28 = attention_mask_29 = None + transpose_116 = attn_output_112.transpose(1, 2) + attn_output_112 = None + attn_output_113 = transpose_116.contiguous() + transpose_116 = None + reshape_28 = attn_output_113.reshape(1, 2, -1) + attn_output_113 = None + attn_output_114 = reshape_28.contiguous() + reshape_28 = None + attn_output_115 = torch._C._nn.linear( + attn_output_114, + l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_114 = l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_56 = torch.nn.functional.dropout(attn_output_115, 0.0, False, False) + attn_output_115 = None + hidden_states_255 = hidden_states_251 + dropout_56 + hidden_states_251 = dropout_56 = None + hidden_states_256 = hidden_states_255.to(torch.float32) + pow_58 = hidden_states_256.pow(2) + variance_57 = pow_58.mean(-1, keepdim=True) + pow_58 = None + add_172 = variance_57 + 1e-05 + variance_57 = None + rsqrt_57 = torch.rsqrt(add_172) + add_172 = None + hidden_states_257 = hidden_states_256 * rsqrt_57 + hidden_states_256 = rsqrt_57 = None + to_119 = hidden_states_257.to(torch.bfloat16) + hidden_states_257 = None + hidden_states_258 = ( + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ + * to_119 + ) + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = ( + 
to_119 + ) = None + up_states_84 = torch._C._nn.linear( + hidden_states_258, + l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_258 = l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_28 = up_states_84.chunk(2, dim=-1) + up_states_84 = None + gate_28 = chunk_28[0] + up_states_85 = chunk_28[1] + chunk_28 = None + silu_28 = torch.nn.functional.silu(gate_28, inplace=False) + gate_28 = None + up_states_86 = up_states_85 * silu_28 + up_states_85 = silu_28 = None + hidden_states_259 = torch._C._nn.linear( + up_states_86, + l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_86 = l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_57 = torch.nn.functional.dropout(hidden_states_259, 0.0, False, False) + hidden_states_259 = None + hidden_states_260 = hidden_states_255 + dropout_57 + hidden_states_255 = dropout_57 = None + hidden_states_261 = hidden_states_260.to(torch.float32) + pow_59 = hidden_states_261.pow(2) + variance_58 = pow_59.mean(-1, keepdim=True) + pow_59 = None + add_174 = variance_58 + 1e-05 + variance_58 = None + rsqrt_58 = torch.rsqrt(add_174) + add_174 = None + hidden_states_262 = hidden_states_261 * rsqrt_58 + hidden_states_261 = rsqrt_58 = None + to_121 = hidden_states_262.to(torch.bfloat16) + hidden_states_262 = None + hidden_states_263 = ( + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + * to_121 + ) + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = ( + to_121 + ) = None + qkv_29 = torch._C._nn.linear( + hidden_states_263, + l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_263 = l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_58 = qkv_29[(Ellipsis, slice(None, 3072, None))] + key_states_58 = qkv_29[(Ellipsis, slice(3072, 6144, None))] + value_states_58 = qkv_29[(Ellipsis, slice(6144, None, None))] + qkv_29 = None + view_88 = query_states_58.view((1, 2, -1, 96)) + query_states_58 = None + query_states_59 = view_88.transpose(1, 2) + view_88 = None + view_89 = key_states_58.view((1, 2, -1, 96)) + key_states_58 = None + key_states_59 = view_89.transpose(1, 2) + view_89 = None + view_90 = value_states_58.view((1, 2, -1, 96)) + value_states_58 = None + value_states_59 = view_90.transpose(1, 2) + view_90 = None + cos_32 = cos_2.unsqueeze(1) + sin_32 = sin_2.unsqueeze(1) + q_rot_29 = query_states_59[(Ellipsis, slice(None, 96, None))] + q_pass_29 = query_states_59[(Ellipsis, slice(96, None, None))] + query_states_59 = None + k_rot_29 = key_states_59[(Ellipsis, slice(None, 96, None))] + k_pass_29 = key_states_59[(Ellipsis, slice(96, None, None))] + key_states_59 = None + mul_266 = q_rot_29 * cos_32 + x1_58 = q_rot_29[(Ellipsis, slice(None, 48, None))] + x2_58 = q_rot_29[(Ellipsis, slice(48, None, None))] + q_rot_29 = None + neg_58 = -x2_58 + x2_58 = None + cat_117 = torch.cat((neg_58, x1_58), dim=-1) + neg_58 = x1_58 = None + mul_267 = cat_117 * sin_32 + cat_117 = None + add_175 = mul_266 + mul_267 + mul_266 = mul_267 = None + q_embed_29 = torch.cat([add_175, q_pass_29], dim=-1) + add_175 = q_pass_29 = None + mul_268 = k_rot_29 * cos_32 + cos_32 = None + x1_59 = k_rot_29[(Ellipsis, slice(None, 48, None))] + x2_59 = k_rot_29[(Ellipsis, slice(48, None, None))] + k_rot_29 = None + neg_59 
= -x2_59 + x2_59 = None + cat_119 = torch.cat((neg_59, x1_59), dim=-1) + neg_59 = x1_59 = None + mul_269 = cat_119 * sin_32 + cat_119 = sin_32 = None + add_176 = mul_268 + mul_269 + mul_268 = mul_269 = None + k_embed_29 = torch.cat([add_176, k_pass_29], dim=-1) + add_176 = k_pass_29 = None + attention_mask_30 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_29 = q_embed_29.contiguous() + q_embed_29 = None + key_29 = k_embed_29.contiguous() + value_29 = value_states_59.contiguous() + attn_output_116 = torch._C._nn.scaled_dot_product_attention( + query_29, + key_29, + value_29, + attn_mask=attention_mask_30, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_29 = key_29 = value_29 = attention_mask_30 = None + transpose_120 = attn_output_116.transpose(1, 2) + attn_output_116 = None + attn_output_117 = transpose_120.contiguous() + transpose_120 = None + reshape_29 = attn_output_117.reshape(1, 2, -1) + attn_output_117 = None + attn_output_118 = reshape_29.contiguous() + reshape_29 = None + attn_output_119 = torch._C._nn.linear( + attn_output_118, + l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_118 = l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_58 = torch.nn.functional.dropout(attn_output_119, 0.0, False, False) + attn_output_119 = None + hidden_states_264 = hidden_states_260 + dropout_58 + hidden_states_260 = dropout_58 = None + hidden_states_265 = hidden_states_264.to(torch.float32) + pow_60 = hidden_states_265.pow(2) + variance_59 = pow_60.mean(-1, keepdim=True) + pow_60 = None + add_178 = variance_59 + 1e-05 + variance_59 = None + rsqrt_59 = torch.rsqrt(add_178) + add_178 = None + hidden_states_266 = hidden_states_265 * rsqrt_59 + hidden_states_265 = rsqrt_59 = None + to_123 = hidden_states_266.to(torch.bfloat16) + hidden_states_266 = None + hidden_states_267 = ( + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ + * to_123 + ) + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = ( + to_123 + ) = None + up_states_87 = torch._C._nn.linear( + hidden_states_267, + l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_267 = l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_29 = up_states_87.chunk(2, dim=-1) + up_states_87 = None + gate_29 = chunk_29[0] + up_states_88 = chunk_29[1] + chunk_29 = None + silu_29 = torch.nn.functional.silu(gate_29, inplace=False) + gate_29 = None + up_states_89 = up_states_88 * silu_29 + up_states_88 = silu_29 = None + hidden_states_268 = torch._C._nn.linear( + up_states_89, + l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_89 = l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_59 = torch.nn.functional.dropout(hidden_states_268, 0.0, False, False) + hidden_states_268 = None + hidden_states_269 = hidden_states_264 + dropout_59 + hidden_states_264 = dropout_59 = None + hidden_states_270 = hidden_states_269.to(torch.float32) + pow_61 = hidden_states_270.pow(2) + variance_60 = pow_61.mean(-1, keepdim=True) + pow_61 = None + add_180 = variance_60 + 1e-05 + variance_60 = None + rsqrt_60 = torch.rsqrt(add_180) + add_180 = None + hidden_states_271 
= hidden_states_270 * rsqrt_60 + hidden_states_270 = rsqrt_60 = None + to_125 = hidden_states_271.to(torch.bfloat16) + hidden_states_271 = None + hidden_states_272 = ( + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + * to_125 + ) + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = ( + to_125 + ) = None + qkv_30 = torch._C._nn.linear( + hidden_states_272, + l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_272 = l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_60 = qkv_30[(Ellipsis, slice(None, 3072, None))] + key_states_60 = qkv_30[(Ellipsis, slice(3072, 6144, None))] + value_states_60 = qkv_30[(Ellipsis, slice(6144, None, None))] + qkv_30 = None + view_91 = query_states_60.view((1, 2, -1, 96)) + query_states_60 = None + query_states_61 = view_91.transpose(1, 2) + view_91 = None + view_92 = key_states_60.view((1, 2, -1, 96)) + key_states_60 = None + key_states_61 = view_92.transpose(1, 2) + view_92 = None + view_93 = value_states_60.view((1, 2, -1, 96)) + value_states_60 = None + value_states_61 = view_93.transpose(1, 2) + view_93 = None + cos_33 = cos_2.unsqueeze(1) + sin_33 = sin_2.unsqueeze(1) + q_rot_30 = query_states_61[(Ellipsis, slice(None, 96, None))] + q_pass_30 = query_states_61[(Ellipsis, slice(96, None, None))] + query_states_61 = None + k_rot_30 = key_states_61[(Ellipsis, slice(None, 96, None))] + k_pass_30 = key_states_61[(Ellipsis, slice(96, None, None))] + key_states_61 = None + mul_275 = q_rot_30 * cos_33 + x1_60 = q_rot_30[(Ellipsis, slice(None, 48, None))] + x2_60 = q_rot_30[(Ellipsis, slice(48, None, None))] + q_rot_30 = None + neg_60 = -x2_60 + x2_60 = None + cat_121 = torch.cat((neg_60, x1_60), dim=-1) + neg_60 = x1_60 = None + mul_276 = cat_121 * sin_33 + cat_121 = None + add_181 = mul_275 + mul_276 + mul_275 = mul_276 = None + q_embed_30 = torch.cat([add_181, q_pass_30], dim=-1) + add_181 = q_pass_30 = None + mul_277 = k_rot_30 * cos_33 + cos_33 = None + x1_61 = k_rot_30[(Ellipsis, slice(None, 48, None))] + x2_61 = k_rot_30[(Ellipsis, slice(48, None, None))] + k_rot_30 = None + neg_61 = -x2_61 + x2_61 = None + cat_123 = torch.cat((neg_61, x1_61), dim=-1) + neg_61 = x1_61 = None + mul_278 = cat_123 * sin_33 + cat_123 = sin_33 = None + add_182 = mul_277 + mul_278 + mul_277 = mul_278 = None + k_embed_30 = torch.cat([add_182, k_pass_30], dim=-1) + add_182 = k_pass_30 = None + attention_mask_31 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_30 = q_embed_30.contiguous() + q_embed_30 = None + key_30 = k_embed_30.contiguous() + value_30 = value_states_61.contiguous() + attn_output_120 = torch._C._nn.scaled_dot_product_attention( + query_30, + key_30, + value_30, + attn_mask=attention_mask_31, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_30 = key_30 = value_30 = attention_mask_31 = None + transpose_124 = attn_output_120.transpose(1, 2) + attn_output_120 = None + attn_output_121 = transpose_124.contiguous() + transpose_124 = None + reshape_30 = attn_output_121.reshape(1, 2, -1) + attn_output_121 = None + attn_output_122 = reshape_30.contiguous() + reshape_30 = None + attn_output_123 = torch._C._nn.linear( + attn_output_122, + l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_122 = 
l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_60 = torch.nn.functional.dropout(attn_output_123, 0.0, False, False) + attn_output_123 = None + hidden_states_273 = hidden_states_269 + dropout_60 + hidden_states_269 = dropout_60 = None + hidden_states_274 = hidden_states_273.to(torch.float32) + pow_62 = hidden_states_274.pow(2) + variance_61 = pow_62.mean(-1, keepdim=True) + pow_62 = None + add_184 = variance_61 + 1e-05 + variance_61 = None + rsqrt_61 = torch.rsqrt(add_184) + add_184 = None + hidden_states_275 = hidden_states_274 * rsqrt_61 + hidden_states_274 = rsqrt_61 = None + to_127 = hidden_states_275.to(torch.bfloat16) + hidden_states_275 = None + hidden_states_276 = ( + l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ + * to_127 + ) + l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = ( + to_127 + ) = None + up_states_90 = torch._C._nn.linear( + hidden_states_276, + l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_276 = l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_30 = up_states_90.chunk(2, dim=-1) + up_states_90 = None + gate_30 = chunk_30[0] + up_states_91 = chunk_30[1] + chunk_30 = None + silu_30 = torch.nn.functional.silu(gate_30, inplace=False) + gate_30 = None + up_states_92 = up_states_91 * silu_30 + up_states_91 = silu_30 = None + hidden_states_277 = torch._C._nn.linear( + up_states_92, + l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_92 = l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_61 = torch.nn.functional.dropout(hidden_states_277, 0.0, False, False) + hidden_states_277 = None + hidden_states_278 = hidden_states_273 + dropout_61 + hidden_states_273 = dropout_61 = None + hidden_states_279 = hidden_states_278.to(torch.float32) + pow_63 = hidden_states_279.pow(2) + variance_62 = pow_63.mean(-1, keepdim=True) + pow_63 = None + add_186 = variance_62 + 1e-05 + variance_62 = None + rsqrt_62 = torch.rsqrt(add_186) + add_186 = None + hidden_states_280 = hidden_states_279 * rsqrt_62 + hidden_states_279 = rsqrt_62 = None + to_129 = hidden_states_280.to(torch.bfloat16) + hidden_states_280 = None + hidden_states_281 = ( + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ + * to_129 + ) + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = ( + to_129 + ) = None + qkv_31 = torch._C._nn.linear( + hidden_states_281, + l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_281 = l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_62 = qkv_31[(Ellipsis, slice(None, 3072, None))] + key_states_62 = qkv_31[(Ellipsis, slice(3072, 6144, None))] + value_states_62 = qkv_31[(Ellipsis, slice(6144, None, None))] + qkv_31 = None + view_94 = query_states_62.view((1, 2, -1, 96)) + query_states_62 = None + query_states_63 = view_94.transpose(1, 2) + view_94 = None + view_95 = key_states_62.view((1, 2, -1, 96)) + key_states_62 = None + key_states_63 = view_95.transpose(1, 2) + view_95 = None + view_96 = value_states_62.view((1, 2, -1, 96)) + value_states_62 = None + value_states_63 = view_96.transpose(1, 2) + view_96 = None + cos_34 = cos_2.unsqueeze(1) + cos_2 = None + sin_34 = 
sin_2.unsqueeze(1) + sin_2 = None + q_rot_31 = query_states_63[(Ellipsis, slice(None, 96, None))] + q_pass_31 = query_states_63[(Ellipsis, slice(96, None, None))] + query_states_63 = None + k_rot_31 = key_states_63[(Ellipsis, slice(None, 96, None))] + k_pass_31 = key_states_63[(Ellipsis, slice(96, None, None))] + key_states_63 = None + mul_284 = q_rot_31 * cos_34 + x1_62 = q_rot_31[(Ellipsis, slice(None, 48, None))] + x2_62 = q_rot_31[(Ellipsis, slice(48, None, None))] + q_rot_31 = None + neg_62 = -x2_62 + x2_62 = None + cat_125 = torch.cat((neg_62, x1_62), dim=-1) + neg_62 = x1_62 = None + mul_285 = cat_125 * sin_34 + cat_125 = None + add_187 = mul_284 + mul_285 + mul_284 = mul_285 = None + q_embed_31 = torch.cat([add_187, q_pass_31], dim=-1) + add_187 = q_pass_31 = None + mul_286 = k_rot_31 * cos_34 + cos_34 = None + x1_63 = k_rot_31[(Ellipsis, slice(None, 48, None))] + x2_63 = k_rot_31[(Ellipsis, slice(48, None, None))] + k_rot_31 = None + neg_63 = -x2_63 + x2_63 = None + cat_127 = torch.cat((neg_63, x1_63), dim=-1) + neg_63 = x1_63 = None + mul_287 = cat_127 * sin_34 + cat_127 = sin_34 = None + add_188 = mul_286 + mul_287 + mul_286 = mul_287 = None + k_embed_31 = torch.cat([add_188, k_pass_31], dim=-1) + add_188 = k_pass_31 = None + attention_mask_32 = causal_mask_3[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + causal_mask_3 = None + query_31 = q_embed_31.contiguous() + q_embed_31 = None + key_31 = k_embed_31.contiguous() + value_31 = value_states_63.contiguous() + attn_output_124 = torch._C._nn.scaled_dot_product_attention( + query_31, + key_31, + value_31, + attn_mask=attention_mask_32, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_31 = key_31 = value_31 = attention_mask_32 = None + transpose_128 = attn_output_124.transpose(1, 2) + attn_output_124 = None + attn_output_125 = transpose_128.contiguous() + transpose_128 = None + reshape_31 = attn_output_125.reshape(1, 2, -1) + attn_output_125 = None + attn_output_126 = reshape_31.contiguous() + reshape_31 = None + attn_output_127 = torch._C._nn.linear( + attn_output_126, + l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_126 = l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_62 = torch.nn.functional.dropout(attn_output_127, 0.0, False, False) + attn_output_127 = None + hidden_states_282 = hidden_states_278 + dropout_62 + hidden_states_278 = dropout_62 = None + hidden_states_283 = hidden_states_282.to(torch.float32) + pow_64 = hidden_states_283.pow(2) + variance_63 = pow_64.mean(-1, keepdim=True) + pow_64 = None + add_190 = variance_63 + 1e-05 + variance_63 = None + rsqrt_63 = torch.rsqrt(add_190) + add_190 = None + hidden_states_284 = hidden_states_283 * rsqrt_63 + hidden_states_283 = rsqrt_63 = None + to_131 = hidden_states_284.to(torch.bfloat16) + hidden_states_284 = None + hidden_states_285 = ( + l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ + * to_131 + ) + l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = ( + to_131 + ) = None + up_states_93 = torch._C._nn.linear( + hidden_states_285, + l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_285 = l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_31 = up_states_93.chunk(2, dim=-1) 
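+ # Editor's note: chunk_31 splits the fused gate_up projection into the
+ # gate branch (chunk_31[0]) and the up branch (chunk_31[1]); this is the
+ # same gated (SwiGLU-style) MLP every layer above uses. A sketch of the
+ # whole block, with hypothetical names:
+ def _gated_mlp_sketch(x, w_gate_up, w_down):
+     gate, up = torch.nn.functional.linear(x, w_gate_up).chunk(2, dim=-1)
+     return torch.nn.functional.linear(up * torch.nn.functional.silu(gate), w_down)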
+ up_states_93 = None + gate_31 = chunk_31[0] + up_states_94 = chunk_31[1] + chunk_31 = None + silu_31 = torch.nn.functional.silu(gate_31, inplace=False) + gate_31 = None + up_states_95 = up_states_94 * silu_31 + up_states_94 = silu_31 = None + hidden_states_286 = torch._C._nn.linear( + up_states_95, + l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_95 = l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_63 = torch.nn.functional.dropout(hidden_states_286, 0.0, False, False) + hidden_states_286 = None + hidden_states_287 = hidden_states_282 + dropout_63 + hidden_states_282 = dropout_63 = None + hidden_states_288 = hidden_states_287.to(torch.float32) + hidden_states_287 = None + pow_65 = hidden_states_288.pow(2) + variance_64 = pow_65.mean(-1, keepdim=True) + pow_65 = None + add_192 = variance_64 + 1e-05 + variance_64 = None + rsqrt_64 = torch.rsqrt(add_192) + add_192 = None + hidden_states_289 = hidden_states_288 * rsqrt_64 + hidden_states_288 = rsqrt_64 = None + to_133 = hidden_states_289.to(torch.bfloat16) + hidden_states_289 = None + hidden_states_290 = l_self_modules_norm_parameters_weight_ * to_133 + l_self_modules_norm_parameters_weight_ = to_133 = None + return ( + value_states_1, + k_embed, + value_states_3, + k_embed_1, + value_states_5, + k_embed_2, + value_states_7, + k_embed_3, + value_states_9, + k_embed_4, + value_states_11, + k_embed_5, + value_states_13, + k_embed_6, + value_states_15, + k_embed_7, + value_states_17, + k_embed_8, + value_states_19, + k_embed_9, + value_states_21, + k_embed_10, + value_states_23, + k_embed_11, + value_states_25, + k_embed_12, + value_states_27, + k_embed_13, + value_states_29, + k_embed_14, + value_states_31, + k_embed_15, + value_states_33, + k_embed_16, + value_states_35, + k_embed_17, + value_states_37, + k_embed_18, + value_states_39, + k_embed_19, + value_states_41, + k_embed_20, + value_states_43, + k_embed_21, + value_states_45, + k_embed_22, + value_states_47, + k_embed_23, + value_states_49, + k_embed_24, + value_states_51, + k_embed_25, + value_states_53, + k_embed_26, + value_states_55, + k_embed_27, + value_states_57, + k_embed_28, + value_states_59, + k_embed_29, + value_states_61, + k_embed_30, + value_states_63, + k_embed_31, + hidden_states_290, + ) diff --git a/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/weight_meta.py b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/weight_meta.py new file mode 100644 index 000000000..87fa673b7 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/Phi-3-mini-4k-instruct/weight_meta.py @@ -0,0 +1,2007 @@ +class Program_weight_tensor_meta_L_inputs_embeds_: + name = "L_inputs_embeds_" + shape = [1, 2, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1] + + +class Program_weight_tensor_meta_L_self_modules_rotary_emb_buffers_inv_freq_: + name = "L_self_modules_rotary_emb_buffers_inv_freq_" + shape = [48] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.119 + std = 0.229 + data = [ + 1.000000, + 0.825404, + 0.681292, + 0.562341, + 0.464159, + 0.383119, + 0.316228, + 0.261016, + 0.215443, + 0.177828, + 0.146780, + 0.121153, + 0.100000, + 0.082540, + 0.068129, + 0.056234, + 0.046416, + 0.038312, + 0.031623, + 
0.026102, + 0.021544, + 0.017783, + 0.014678, + 0.012115, + 0.010000, + 0.008254, + 0.006813, + 0.005623, + 0.004642, + 0.003831, + 0.003162, + 0.002610, + 0.002154, + 0.001778, + 0.001468, + 0.001212, + 0.001000, + 0.000825, + 0.000681, + 0.000562, + 0.000464, + 0.000383, + 0.000316, + 0.000261, + 0.000215, + 0.000178, + 0.000147, + 0.000121, + ] + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = 
"cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + 
dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + 
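# (16384 = 2 * 8192: the gate and up projections are stored fused in one
+ # matrix and split with .chunk(2, dim=-1) in the traced forward; down_proj's
+ # [3072, 8192] shape matches the 8192-wide product of the two branches.) +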
std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + 
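# (Editor's note: each Program_weight_tensor_meta_* class appears to carry
+ # only summary statistics for one tensor; data = None means the raw values
+ # are not embedded, while mean/std summarize the value distribution.) +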
device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" 
+ device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + 
mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_" + shape = [3072] + 
dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = 
"cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = 
[9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_norm_parameters_weight_: + name = "L_self_modules_norm_parameters_weight_" + shape = [3072] + dtype = 
"torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/graph_hash.txt b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/graph_hash.txt new file mode 100644 index 000000000..b61194c16 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/graph_hash.txt @@ -0,0 +1 @@ +4be2fde33bf182b985db4c2c62e62b1cf11ce8bc64ee17004a9c3d626fc6a262 \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/graph_net.json b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/input_meta.py b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/input_tensor_constraints.py b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/model.py b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/model.py new file mode 100644 index 000000000..541227255 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/model.py @@ -0,0 +1,5736 @@ +import torch + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_hidden_states_: torch.Tensor, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_stack0_0_: torch.Tensor, + L_stack0_1_: torch.Tensor, + L_causal_mask_: torch.Tensor, + L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
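The weight_meta.py entries above record only shape, dtype, device, and summary statistics, with `data = None`; the fused shapes follow Phi-3-mini's geometry, e.g. qkv_proj is [9216, 3072] (3 x 3072 rows for the merged q/k/v projections) and gate_up_proj is [16384, 3072] (2 x 8192 rows for the merged gate and up projections, intermediate size 8192). A minimal sketch of how such a record could be rehydrated into a tensor; `materialize` is a hypothetical helper, not part of this patch, and it assumes the recorded `cuda:0` device is available:

import torch

def materialize(meta) -> torch.Tensor:
    # `meta` is any of the Program_weight_tensor_meta_* classes above;
    # only shape/dtype/device/mean/std are recorded, data is None.
    dtype = getattr(torch, meta.dtype.split(".")[-1])  # "torch.bfloat16" -> torch.bfloat16
    if meta.std == 0.0:
        # Layernorm weights: mean 1.000, std 0.000 -> a constant tensor.
        tensor = torch.full(meta.shape, meta.mean)
    else:
        # Projection weights: draw from N(mean, std) to match the recorded stats.
        tensor = torch.normal(meta.mean, meta.std, size=tuple(meta.shape))
    return tensor.to(dtype=dtype, device=meta.device)

Passing it `Program_weight_tensor_meta_L_self_modules_norm_parameters_weight_`, for instance, would yield the length-3072 bfloat16 ones vector that the (1.000, 0.000) mean/std pair describes.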
L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
+        L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_norm_parameters_weight_: torch.nn.parameter.Parameter,
+    ):
+        l_hidden_states_ = L_hidden_states_
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_
+        )
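The argument names in this generated signature encode their source module paths, and the body opens with hundreds of lines that do nothing but re-bind each `L_*` argument to a lowercase `l_*` local, as above, before the first real operation. Recovering the dotted path from a flattened name is mechanical; a sketch assuming only the naming convention visible in this dump (`to_module_path` is illustrative, and plain graph inputs such as `L_hidden_states_` or `L_causal_mask_` do not carry the `L_self_modules_` prefix):

import re

def to_module_path(flat_name: str) -> str:
    # "L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_"
    #   -> "layers.0.self_attn.qkv_proj.weight"
    body = flat_name.removeprefix("L_self_modules_").rstrip("_")
    # Path components are joined by the literal markers "_modules_" and "_parameters_".
    return ".".join(re.split(r"_modules_|_parameters_", body))

assert to_module_path(
    "L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_"
) == "layers.27.mlp.down_proj.weight"

Under that reading, `L_stack0_0_` and `L_stack0_1_` are presumably the precomputed rotary cos/sin pair handed in alongside the hidden states, though the dump itself does not name them.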
l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_stack0_0_ = L_stack0_0_ + l_stack0_1_ = L_stack0_1_ + l_causal_mask_ = L_causal_mask_ + l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ + 
l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = 
L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ + 
l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = 
L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + ) + 
l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = 
L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ + 
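+ # NOTE (editorial): the rebinding is mechanical; the same six per-layer
+ # weights recur for all 32 decoder layers (indices 0-31): input_layernorm,
+ # self_attn.qkv_proj, self_attn.o_proj, post_attention_layernorm,
+ # mlp.gate_up_proj, and mlp.down_proj, with the model-level `norm` weight
+ # last.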
l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = 
L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ + 
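+ # NOTE (editorial): once the remaining rebinds finish, the captured math
+ # begins with an inlined RMSNorm rather than a module call: cast to float32,
+ # take mean(x.pow(2), dim=-1, keepdim=True), multiply by
+ # rsqrt(variance + 1e-05), cast back to bfloat16, then scale by the
+ # layernorm weight. The identical sequence precedes every attention block
+ # and every MLP block below.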
l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_norm_parameters_weight_ = L_self_modules_norm_parameters_weight_ + _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module") + _log_api_usage_once = None + hidden_states = l_hidden_states_.to(torch.float32) + pow_1 = hidden_states.pow(2) + variance = pow_1.mean(-1, keepdim=True) + pow_1 = None + add = variance + 1e-05 + variance = None + rsqrt = torch.rsqrt(add) + add = None + hidden_states_1 = hidden_states * rsqrt + hidden_states = rsqrt = None + to_1 = hidden_states_1.to(torch.bfloat16) + hidden_states_1 = None + hidden_states_2 = ( + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + * to_1 + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + to_1 + ) = None + qkv = torch._C._nn.linear( + hidden_states_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states = qkv[(Ellipsis, slice(None, 3072, None))] + key_states = qkv[(Ellipsis, slice(3072, 6144, None))] + value_states = qkv[(Ellipsis, slice(6144, None, None))] + qkv = None + view = query_states.view((1, 2, -1, 96)) + query_states = None + query_states_1 = view.transpose(1, 2) + view = None + view_1 = key_states.view((1, 2, -1, 96)) + key_states = None + key_states_1 = view_1.transpose(1, 2) + view_1 = None + view_2 = value_states.view((1, 2, -1, 96)) + value_states = None + value_states_1 = view_2.transpose(1, 2) + view_2 = None + cos = l_stack0_0_.unsqueeze(1) + sin = l_stack0_1_.unsqueeze(1) + q_rot = query_states_1[(Ellipsis, slice(None, 96, None))] + q_pass = query_states_1[(Ellipsis, slice(96, None, None))] + query_states_1 = None + k_rot = key_states_1[(Ellipsis, slice(None, 96, None))] + k_pass = key_states_1[(Ellipsis, slice(96, None, None))] + key_states_1 = None + mul_2 = q_rot * cos + x1 = q_rot[(Ellipsis, slice(None, 48, None))] + x2 = q_rot[(Ellipsis, slice(48, None, None))] + q_rot = None + neg = -x2 + x2 = None + cat = torch.cat((neg, x1), dim=-1) + neg = x1 = None + mul_3 = cat * sin + cat = None + add_1 = mul_2 + mul_3 + mul_2 = mul_3 = None + q_embed = torch.cat([add_1, q_pass], dim=-1) + add_1 = q_pass = None + mul_4 = k_rot * cos + cos = None + x1_1 = k_rot[(Ellipsis, 
slice(None, 48, None))] + x2_1 = k_rot[(Ellipsis, slice(48, None, None))] + k_rot = None + neg_1 = -x2_1 + x2_1 = None + cat_2 = torch.cat((neg_1, x1_1), dim=-1) + neg_1 = x1_1 = None + mul_5 = cat_2 * sin + cat_2 = sin = None + add_2 = mul_4 + mul_5 + mul_4 = mul_5 = None + k_embed = torch.cat([add_2, k_pass], dim=-1) + add_2 = k_pass = None + attention_mask = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query = q_embed.contiguous() + q_embed = None + key = k_embed.contiguous() + value = value_states_1.contiguous() + attn_output = torch._C._nn.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query = key = value = attention_mask = None + transpose_3 = attn_output.transpose(1, 2) + attn_output = None + attn_output_1 = transpose_3.contiguous() + transpose_3 = None + reshape = attn_output_1.reshape(1, 2, -1) + attn_output_1 = None + attn_output_2 = reshape.contiguous() + reshape = None + attn_output_3 = torch._C._nn.linear( + attn_output_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout = torch.nn.functional.dropout(attn_output_3, 0.0, False, False) + attn_output_3 = None + hidden_states_3 = l_hidden_states_ + dropout + l_hidden_states_ = dropout = None + hidden_states_4 = hidden_states_3.to(torch.float32) + pow_2 = hidden_states_4.pow(2) + variance_1 = pow_2.mean(-1, keepdim=True) + pow_2 = None + add_4 = variance_1 + 1e-05 + variance_1 = None + rsqrt_1 = torch.rsqrt(add_4) + add_4 = None + hidden_states_5 = hidden_states_4 * rsqrt_1 + hidden_states_4 = rsqrt_1 = None + to_3 = hidden_states_5.to(torch.bfloat16) + hidden_states_5 = None + hidden_states_6 = ( + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + * to_3 + ) + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = ( + to_3 + ) = None + up_states = torch._C._nn.linear( + hidden_states_6, + l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_6 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk = up_states.chunk(2, dim=-1) + up_states = None + gate = chunk[0] + up_states_1 = chunk[1] + chunk = None + silu = torch.nn.functional.silu(gate, inplace=False) + gate = None + up_states_2 = up_states_1 * silu + up_states_1 = silu = None + hidden_states_7 = torch._C._nn.linear( + up_states_2, + l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_2 = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_1 = torch.nn.functional.dropout(hidden_states_7, 0.0, False, False) + hidden_states_7 = None + hidden_states_8 = hidden_states_3 + dropout_1 + hidden_states_3 = dropout_1 = None + hidden_states_9 = hidden_states_8.to(torch.float32) + pow_3 = hidden_states_9.pow(2) + variance_2 = pow_3.mean(-1, keepdim=True) + pow_3 = None + add_6 = variance_2 + 1e-05 + variance_2 = None + rsqrt_2 = torch.rsqrt(add_6) + add_6 = None + hidden_states_10 = hidden_states_9 * rsqrt_2 + hidden_states_9 = rsqrt_2 = None + to_5 = hidden_states_10.to(torch.bfloat16) + hidden_states_10 = None + hidden_states_11 = ( + 
l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + * to_5 + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + to_5 + ) = None + qkv_1 = torch._C._nn.linear( + hidden_states_11, + l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_11 = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_2 = qkv_1[(Ellipsis, slice(None, 3072, None))] + key_states_2 = qkv_1[(Ellipsis, slice(3072, 6144, None))] + value_states_2 = qkv_1[(Ellipsis, slice(6144, None, None))] + qkv_1 = None + view_3 = query_states_2.view((1, 2, -1, 96)) + query_states_2 = None + query_states_3 = view_3.transpose(1, 2) + view_3 = None + view_4 = key_states_2.view((1, 2, -1, 96)) + key_states_2 = None + key_states_3 = view_4.transpose(1, 2) + view_4 = None + view_5 = value_states_2.view((1, 2, -1, 96)) + value_states_2 = None + value_states_3 = view_5.transpose(1, 2) + view_5 = None + cos_1 = l_stack0_0_.unsqueeze(1) + sin_1 = l_stack0_1_.unsqueeze(1) + q_rot_1 = query_states_3[(Ellipsis, slice(None, 96, None))] + q_pass_1 = query_states_3[(Ellipsis, slice(96, None, None))] + query_states_3 = None + k_rot_1 = key_states_3[(Ellipsis, slice(None, 96, None))] + k_pass_1 = key_states_3[(Ellipsis, slice(96, None, None))] + key_states_3 = None + mul_11 = q_rot_1 * cos_1 + x1_2 = q_rot_1[(Ellipsis, slice(None, 48, None))] + x2_2 = q_rot_1[(Ellipsis, slice(48, None, None))] + q_rot_1 = None + neg_2 = -x2_2 + x2_2 = None + cat_4 = torch.cat((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + mul_12 = cat_4 * sin_1 + cat_4 = None + add_7 = mul_11 + mul_12 + mul_11 = mul_12 = None + q_embed_1 = torch.cat([add_7, q_pass_1], dim=-1) + add_7 = q_pass_1 = None + mul_13 = k_rot_1 * cos_1 + cos_1 = None + x1_3 = k_rot_1[(Ellipsis, slice(None, 48, None))] + x2_3 = k_rot_1[(Ellipsis, slice(48, None, None))] + k_rot_1 = None + neg_3 = -x2_3 + x2_3 = None + cat_6 = torch.cat((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + mul_14 = cat_6 * sin_1 + cat_6 = sin_1 = None + add_8 = mul_13 + mul_14 + mul_13 = mul_14 = None + k_embed_1 = torch.cat([add_8, k_pass_1], dim=-1) + add_8 = k_pass_1 = None + attention_mask_1 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_1 = q_embed_1.contiguous() + q_embed_1 = None + key_1 = k_embed_1.contiguous() + value_1 = value_states_3.contiguous() + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_1, + value_1, + attn_mask=attention_mask_1, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_1 = key_1 = value_1 = attention_mask_1 = None + transpose_7 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_7.contiguous() + transpose_7 = None + reshape_1 = attn_output_5.reshape(1, 2, -1) + attn_output_5 = None + attn_output_6 = reshape_1.contiguous() + reshape_1 = None + attn_output_7 = torch._C._nn.linear( + attn_output_6, + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_6 = l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_2 = torch.nn.functional.dropout(attn_output_7, 0.0, False, False) + attn_output_7 = None + hidden_states_12 = hidden_states_8 + dropout_2 + hidden_states_8 = dropout_2 = None + hidden_states_13 = hidden_states_12.to(torch.float32) + pow_4 = 
hidden_states_13.pow(2) + variance_3 = pow_4.mean(-1, keepdim=True) + pow_4 = None + add_10 = variance_3 + 1e-05 + variance_3 = None + rsqrt_3 = torch.rsqrt(add_10) + add_10 = None + hidden_states_14 = hidden_states_13 * rsqrt_3 + hidden_states_13 = rsqrt_3 = None + to_7 = hidden_states_14.to(torch.bfloat16) + hidden_states_14 = None + hidden_states_15 = ( + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + * to_7 + ) + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = ( + to_7 + ) = None + up_states_3 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_15 = l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_1 = up_states_3.chunk(2, dim=-1) + up_states_3 = None + gate_1 = chunk_1[0] + up_states_4 = chunk_1[1] + chunk_1 = None + silu_1 = torch.nn.functional.silu(gate_1, inplace=False) + gate_1 = None + up_states_5 = up_states_4 * silu_1 + up_states_4 = silu_1 = None + hidden_states_16 = torch._C._nn.linear( + up_states_5, + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_5 = l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_3 = torch.nn.functional.dropout(hidden_states_16, 0.0, False, False) + hidden_states_16 = None + hidden_states_17 = hidden_states_12 + dropout_3 + hidden_states_12 = dropout_3 = None + hidden_states_18 = hidden_states_17.to(torch.float32) + pow_5 = hidden_states_18.pow(2) + variance_4 = pow_5.mean(-1, keepdim=True) + pow_5 = None + add_12 = variance_4 + 1e-05 + variance_4 = None + rsqrt_4 = torch.rsqrt(add_12) + add_12 = None + hidden_states_19 = hidden_states_18 * rsqrt_4 + hidden_states_18 = rsqrt_4 = None + to_9 = hidden_states_19.to(torch.bfloat16) + hidden_states_19 = None + hidden_states_20 = ( + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + * to_9 + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + to_9 + ) = None + qkv_2 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_20 = l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_4 = qkv_2[(Ellipsis, slice(None, 3072, None))] + key_states_4 = qkv_2[(Ellipsis, slice(3072, 6144, None))] + value_states_4 = qkv_2[(Ellipsis, slice(6144, None, None))] + qkv_2 = None + view_6 = query_states_4.view((1, 2, -1, 96)) + query_states_4 = None + query_states_5 = view_6.transpose(1, 2) + view_6 = None + view_7 = key_states_4.view((1, 2, -1, 96)) + key_states_4 = None + key_states_5 = view_7.transpose(1, 2) + view_7 = None + view_8 = value_states_4.view((1, 2, -1, 96)) + value_states_4 = None + value_states_5 = view_8.transpose(1, 2) + view_8 = None + cos_2 = l_stack0_0_.unsqueeze(1) + sin_2 = l_stack0_1_.unsqueeze(1) + q_rot_2 = query_states_5[(Ellipsis, slice(None, 96, None))] + q_pass_2 = query_states_5[(Ellipsis, slice(96, None, None))] + query_states_5 = None + k_rot_2 = key_states_5[(Ellipsis, slice(None, 96, None))] + k_pass_2 = key_states_5[(Ellipsis, slice(96, None, None))] + key_states_5 = None + mul_20 = q_rot_2 * cos_2 + x1_4 = q_rot_2[(Ellipsis, slice(None, 48, None))] + x2_4 = q_rot_2[(Ellipsis, slice(48, None, None))] + q_rot_2 = None + neg_4 = -x2_4 + x2_4 = 
None + cat_8 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_21 = cat_8 * sin_2 + cat_8 = None + add_13 = mul_20 + mul_21 + mul_20 = mul_21 = None + q_embed_2 = torch.cat([add_13, q_pass_2], dim=-1) + add_13 = q_pass_2 = None + mul_22 = k_rot_2 * cos_2 + cos_2 = None + x1_5 = k_rot_2[(Ellipsis, slice(None, 48, None))] + x2_5 = k_rot_2[(Ellipsis, slice(48, None, None))] + k_rot_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_10 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_23 = cat_10 * sin_2 + cat_10 = sin_2 = None + add_14 = mul_22 + mul_23 + mul_22 = mul_23 = None + k_embed_2 = torch.cat([add_14, k_pass_2], dim=-1) + add_14 = k_pass_2 = None + attention_mask_2 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_2 = q_embed_2.contiguous() + q_embed_2 = None + key_2 = k_embed_2.contiguous() + value_2 = value_states_5.contiguous() + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_2, + value_2, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_2 = key_2 = value_2 = attention_mask_2 = None + transpose_11 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_11.contiguous() + transpose_11 = None + reshape_2 = attn_output_9.reshape(1, 2, -1) + attn_output_9 = None + attn_output_10 = reshape_2.contiguous() + reshape_2 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_10 = l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_4 = torch.nn.functional.dropout(attn_output_11, 0.0, False, False) + attn_output_11 = None + hidden_states_21 = hidden_states_17 + dropout_4 + hidden_states_17 = dropout_4 = None + hidden_states_22 = hidden_states_21.to(torch.float32) + pow_6 = hidden_states_22.pow(2) + variance_5 = pow_6.mean(-1, keepdim=True) + pow_6 = None + add_16 = variance_5 + 1e-05 + variance_5 = None + rsqrt_5 = torch.rsqrt(add_16) + add_16 = None + hidden_states_23 = hidden_states_22 * rsqrt_5 + hidden_states_22 = rsqrt_5 = None + to_11 = hidden_states_23.to(torch.bfloat16) + hidden_states_23 = None + hidden_states_24 = ( + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + * to_11 + ) + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = ( + to_11 + ) = None + up_states_6 = torch._C._nn.linear( + hidden_states_24, + l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_24 = l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_2 = up_states_6.chunk(2, dim=-1) + up_states_6 = None + gate_2 = chunk_2[0] + up_states_7 = chunk_2[1] + chunk_2 = None + silu_2 = torch.nn.functional.silu(gate_2, inplace=False) + gate_2 = None + up_states_8 = up_states_7 * silu_2 + up_states_7 = silu_2 = None + hidden_states_25 = torch._C._nn.linear( + up_states_8, + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_8 = l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_5 = torch.nn.functional.dropout(hidden_states_25, 0.0, False, False) + hidden_states_25 = None + hidden_states_26 = hidden_states_21 + dropout_5 + hidden_states_21 = dropout_5 = 
None + hidden_states_27 = hidden_states_26.to(torch.float32) + pow_7 = hidden_states_27.pow(2) + variance_6 = pow_7.mean(-1, keepdim=True) + pow_7 = None + add_18 = variance_6 + 1e-05 + variance_6 = None + rsqrt_6 = torch.rsqrt(add_18) + add_18 = None + hidden_states_28 = hidden_states_27 * rsqrt_6 + hidden_states_27 = rsqrt_6 = None + to_13 = hidden_states_28.to(torch.bfloat16) + hidden_states_28 = None + hidden_states_29 = ( + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + * to_13 + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + to_13 + ) = None + qkv_3 = torch._C._nn.linear( + hidden_states_29, + l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_29 = l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_6 = qkv_3[(Ellipsis, slice(None, 3072, None))] + key_states_6 = qkv_3[(Ellipsis, slice(3072, 6144, None))] + value_states_6 = qkv_3[(Ellipsis, slice(6144, None, None))] + qkv_3 = None + view_9 = query_states_6.view((1, 2, -1, 96)) + query_states_6 = None + query_states_7 = view_9.transpose(1, 2) + view_9 = None + view_10 = key_states_6.view((1, 2, -1, 96)) + key_states_6 = None + key_states_7 = view_10.transpose(1, 2) + view_10 = None + view_11 = value_states_6.view((1, 2, -1, 96)) + value_states_6 = None + value_states_7 = view_11.transpose(1, 2) + view_11 = None + cos_3 = l_stack0_0_.unsqueeze(1) + sin_3 = l_stack0_1_.unsqueeze(1) + q_rot_3 = query_states_7[(Ellipsis, slice(None, 96, None))] + q_pass_3 = query_states_7[(Ellipsis, slice(96, None, None))] + query_states_7 = None + k_rot_3 = key_states_7[(Ellipsis, slice(None, 96, None))] + k_pass_3 = key_states_7[(Ellipsis, slice(96, None, None))] + key_states_7 = None + mul_29 = q_rot_3 * cos_3 + x1_6 = q_rot_3[(Ellipsis, slice(None, 48, None))] + x2_6 = q_rot_3[(Ellipsis, slice(48, None, None))] + q_rot_3 = None + neg_6 = -x2_6 + x2_6 = None + cat_12 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_30 = cat_12 * sin_3 + cat_12 = None + add_19 = mul_29 + mul_30 + mul_29 = mul_30 = None + q_embed_3 = torch.cat([add_19, q_pass_3], dim=-1) + add_19 = q_pass_3 = None + mul_31 = k_rot_3 * cos_3 + cos_3 = None + x1_7 = k_rot_3[(Ellipsis, slice(None, 48, None))] + x2_7 = k_rot_3[(Ellipsis, slice(48, None, None))] + k_rot_3 = None + neg_7 = -x2_7 + x2_7 = None + cat_14 = torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_32 = cat_14 * sin_3 + cat_14 = sin_3 = None + add_20 = mul_31 + mul_32 + mul_31 = mul_32 = None + k_embed_3 = torch.cat([add_20, k_pass_3], dim=-1) + add_20 = k_pass_3 = None + attention_mask_3 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_3 = q_embed_3.contiguous() + q_embed_3 = None + key_3 = k_embed_3.contiguous() + value_3 = value_states_7.contiguous() + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_3, + key_3, + value_3, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_3 = key_3 = value_3 = attention_mask_3 = None + transpose_15 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_15.contiguous() + transpose_15 = None + reshape_3 = attn_output_13.reshape(1, 2, -1) + attn_output_13 = None + attn_output_14 = reshape_3.contiguous() + reshape_3 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + 
l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_14 = l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_6 = torch.nn.functional.dropout(attn_output_15, 0.0, False, False) + attn_output_15 = None + hidden_states_30 = hidden_states_26 + dropout_6 + hidden_states_26 = dropout_6 = None + hidden_states_31 = hidden_states_30.to(torch.float32) + pow_8 = hidden_states_31.pow(2) + variance_7 = pow_8.mean(-1, keepdim=True) + pow_8 = None + add_22 = variance_7 + 1e-05 + variance_7 = None + rsqrt_7 = torch.rsqrt(add_22) + add_22 = None + hidden_states_32 = hidden_states_31 * rsqrt_7 + hidden_states_31 = rsqrt_7 = None + to_15 = hidden_states_32.to(torch.bfloat16) + hidden_states_32 = None + hidden_states_33 = ( + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + * to_15 + ) + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = ( + to_15 + ) = None + up_states_9 = torch._C._nn.linear( + hidden_states_33, + l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_33 = l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_3 = up_states_9.chunk(2, dim=-1) + up_states_9 = None + gate_3 = chunk_3[0] + up_states_10 = chunk_3[1] + chunk_3 = None + silu_3 = torch.nn.functional.silu(gate_3, inplace=False) + gate_3 = None + up_states_11 = up_states_10 * silu_3 + up_states_10 = silu_3 = None + hidden_states_34 = torch._C._nn.linear( + up_states_11, + l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_11 = l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_7 = torch.nn.functional.dropout(hidden_states_34, 0.0, False, False) + hidden_states_34 = None + hidden_states_35 = hidden_states_30 + dropout_7 + hidden_states_30 = dropout_7 = None + hidden_states_36 = hidden_states_35.to(torch.float32) + pow_9 = hidden_states_36.pow(2) + variance_8 = pow_9.mean(-1, keepdim=True) + pow_9 = None + add_24 = variance_8 + 1e-05 + variance_8 = None + rsqrt_8 = torch.rsqrt(add_24) + add_24 = None + hidden_states_37 = hidden_states_36 * rsqrt_8 + hidden_states_36 = rsqrt_8 = None + to_17 = hidden_states_37.to(torch.bfloat16) + hidden_states_37 = None + hidden_states_38 = ( + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + * to_17 + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + to_17 + ) = None + qkv_4 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_38 = l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_8 = qkv_4[(Ellipsis, slice(None, 3072, None))] + key_states_8 = qkv_4[(Ellipsis, slice(3072, 6144, None))] + value_states_8 = qkv_4[(Ellipsis, slice(6144, None, None))] + qkv_4 = None + view_12 = query_states_8.view((1, 2, -1, 96)) + query_states_8 = None + query_states_9 = view_12.transpose(1, 2) + view_12 = None + view_13 = key_states_8.view((1, 2, -1, 96)) + key_states_8 = None + key_states_9 = view_13.transpose(1, 2) + view_13 = None + view_14 = value_states_8.view((1, 2, -1, 96)) + value_states_8 = None + value_states_9 = view_14.transpose(1, 2) + view_14 = None + cos_4 = l_stack0_0_.unsqueeze(1) + sin_4 = 
l_stack0_1_.unsqueeze(1) + q_rot_4 = query_states_9[(Ellipsis, slice(None, 96, None))] + q_pass_4 = query_states_9[(Ellipsis, slice(96, None, None))] + query_states_9 = None + k_rot_4 = key_states_9[(Ellipsis, slice(None, 96, None))] + k_pass_4 = key_states_9[(Ellipsis, slice(96, None, None))] + key_states_9 = None + mul_38 = q_rot_4 * cos_4 + x1_8 = q_rot_4[(Ellipsis, slice(None, 48, None))] + x2_8 = q_rot_4[(Ellipsis, slice(48, None, None))] + q_rot_4 = None + neg_8 = -x2_8 + x2_8 = None + cat_16 = torch.cat((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + mul_39 = cat_16 * sin_4 + cat_16 = None + add_25 = mul_38 + mul_39 + mul_38 = mul_39 = None + q_embed_4 = torch.cat([add_25, q_pass_4], dim=-1) + add_25 = q_pass_4 = None + mul_40 = k_rot_4 * cos_4 + cos_4 = None + x1_9 = k_rot_4[(Ellipsis, slice(None, 48, None))] + x2_9 = k_rot_4[(Ellipsis, slice(48, None, None))] + k_rot_4 = None + neg_9 = -x2_9 + x2_9 = None + cat_18 = torch.cat((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + mul_41 = cat_18 * sin_4 + cat_18 = sin_4 = None + add_26 = mul_40 + mul_41 + mul_40 = mul_41 = None + k_embed_4 = torch.cat([add_26, k_pass_4], dim=-1) + add_26 = k_pass_4 = None + attention_mask_4 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_4 = q_embed_4.contiguous() + q_embed_4 = None + key_4 = k_embed_4.contiguous() + value_4 = value_states_9.contiguous() + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_4, + value_4, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_4 = key_4 = value_4 = attention_mask_4 = None + transpose_19 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_19.contiguous() + transpose_19 = None + reshape_4 = attn_output_17.reshape(1, 2, -1) + attn_output_17 = None + attn_output_18 = reshape_4.contiguous() + reshape_4 = None + attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_18 = l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_8 = torch.nn.functional.dropout(attn_output_19, 0.0, False, False) + attn_output_19 = None + hidden_states_39 = hidden_states_35 + dropout_8 + hidden_states_35 = dropout_8 = None + hidden_states_40 = hidden_states_39.to(torch.float32) + pow_10 = hidden_states_40.pow(2) + variance_9 = pow_10.mean(-1, keepdim=True) + pow_10 = None + add_28 = variance_9 + 1e-05 + variance_9 = None + rsqrt_9 = torch.rsqrt(add_28) + add_28 = None + hidden_states_41 = hidden_states_40 * rsqrt_9 + hidden_states_40 = rsqrt_9 = None + to_19 = hidden_states_41.to(torch.bfloat16) + hidden_states_41 = None + hidden_states_42 = ( + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + * to_19 + ) + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = ( + to_19 + ) = None + up_states_12 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_42 = l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_4 = up_states_12.chunk(2, dim=-1) + up_states_12 = None + gate_4 = chunk_4[0] + up_states_13 = chunk_4[1] + chunk_4 = None + silu_4 = torch.nn.functional.silu(gate_4, inplace=False) + gate_4 = None + 
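+ # NOTE (editorial): the MLP is SiLU-gated: gate_up_proj produces one tensor
+ # that chunk(2, dim=-1) splits into (gate, up); the output is
+ # down_proj(up * silu(gate)). The dropout calls are no-ops in this trace
+ # (p=0.0, training=False) but are retained by the capture.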
up_states_14 = up_states_13 * silu_4 + up_states_13 = silu_4 = None + hidden_states_43 = torch._C._nn.linear( + up_states_14, + l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_14 = l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_9 = torch.nn.functional.dropout(hidden_states_43, 0.0, False, False) + hidden_states_43 = None + hidden_states_44 = hidden_states_39 + dropout_9 + hidden_states_39 = dropout_9 = None + hidden_states_45 = hidden_states_44.to(torch.float32) + pow_11 = hidden_states_45.pow(2) + variance_10 = pow_11.mean(-1, keepdim=True) + pow_11 = None + add_30 = variance_10 + 1e-05 + variance_10 = None + rsqrt_10 = torch.rsqrt(add_30) + add_30 = None + hidden_states_46 = hidden_states_45 * rsqrt_10 + hidden_states_45 = rsqrt_10 = None + to_21 = hidden_states_46.to(torch.bfloat16) + hidden_states_46 = None + hidden_states_47 = ( + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + * to_21 + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + to_21 + ) = None + qkv_5 = torch._C._nn.linear( + hidden_states_47, + l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_47 = l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_10 = qkv_5[(Ellipsis, slice(None, 3072, None))] + key_states_10 = qkv_5[(Ellipsis, slice(3072, 6144, None))] + value_states_10 = qkv_5[(Ellipsis, slice(6144, None, None))] + qkv_5 = None + view_15 = query_states_10.view((1, 2, -1, 96)) + query_states_10 = None + query_states_11 = view_15.transpose(1, 2) + view_15 = None + view_16 = key_states_10.view((1, 2, -1, 96)) + key_states_10 = None + key_states_11 = view_16.transpose(1, 2) + view_16 = None + view_17 = value_states_10.view((1, 2, -1, 96)) + value_states_10 = None + value_states_11 = view_17.transpose(1, 2) + view_17 = None + cos_5 = l_stack0_0_.unsqueeze(1) + sin_5 = l_stack0_1_.unsqueeze(1) + q_rot_5 = query_states_11[(Ellipsis, slice(None, 96, None))] + q_pass_5 = query_states_11[(Ellipsis, slice(96, None, None))] + query_states_11 = None + k_rot_5 = key_states_11[(Ellipsis, slice(None, 96, None))] + k_pass_5 = key_states_11[(Ellipsis, slice(96, None, None))] + key_states_11 = None + mul_47 = q_rot_5 * cos_5 + x1_10 = q_rot_5[(Ellipsis, slice(None, 48, None))] + x2_10 = q_rot_5[(Ellipsis, slice(48, None, None))] + q_rot_5 = None + neg_10 = -x2_10 + x2_10 = None + cat_20 = torch.cat((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + mul_48 = cat_20 * sin_5 + cat_20 = None + add_31 = mul_47 + mul_48 + mul_47 = mul_48 = None + q_embed_5 = torch.cat([add_31, q_pass_5], dim=-1) + add_31 = q_pass_5 = None + mul_49 = k_rot_5 * cos_5 + cos_5 = None + x1_11 = k_rot_5[(Ellipsis, slice(None, 48, None))] + x2_11 = k_rot_5[(Ellipsis, slice(48, None, None))] + k_rot_5 = None + neg_11 = -x2_11 + x2_11 = None + cat_22 = torch.cat((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + mul_50 = cat_22 * sin_5 + cat_22 = sin_5 = None + add_32 = mul_49 + mul_50 + mul_49 = mul_50 = None + k_embed_5 = torch.cat([add_32, k_pass_5], dim=-1) + add_32 = k_pass_5 = None + attention_mask_5 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_5 = q_embed_5.contiguous() + q_embed_5 = None + key_5 = k_embed_5.contiguous() + value_5 = value_states_11.contiguous() + 
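# Layer 5 self-attention: SDPA with an explicit mask (is_causal=False); scale 0.10206207261596575 == 96 ** -0.5 for the 96-dim heads. +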
attn_output_20 = torch._C._nn.scaled_dot_product_attention( + query_5, + key_5, + value_5, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_5 = key_5 = value_5 = attention_mask_5 = None + transpose_23 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_23.contiguous() + transpose_23 = None + reshape_5 = attn_output_21.reshape(1, 2, -1) + attn_output_21 = None + attn_output_22 = reshape_5.contiguous() + reshape_5 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_22 = l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_10 = torch.nn.functional.dropout(attn_output_23, 0.0, False, False) + attn_output_23 = None + hidden_states_48 = hidden_states_44 + dropout_10 + hidden_states_44 = dropout_10 = None + hidden_states_49 = hidden_states_48.to(torch.float32) + pow_12 = hidden_states_49.pow(2) + variance_11 = pow_12.mean(-1, keepdim=True) + pow_12 = None + add_34 = variance_11 + 1e-05 + variance_11 = None + rsqrt_11 = torch.rsqrt(add_34) + add_34 = None + hidden_states_50 = hidden_states_49 * rsqrt_11 + hidden_states_49 = rsqrt_11 = None + to_23 = hidden_states_50.to(torch.bfloat16) + hidden_states_50 = None + hidden_states_51 = ( + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + * to_23 + ) + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = ( + to_23 + ) = None + up_states_15 = torch._C._nn.linear( + hidden_states_51, + l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_51 = l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_5 = up_states_15.chunk(2, dim=-1) + up_states_15 = None + gate_5 = chunk_5[0] + up_states_16 = chunk_5[1] + chunk_5 = None + silu_5 = torch.nn.functional.silu(gate_5, inplace=False) + gate_5 = None + up_states_17 = up_states_16 * silu_5 + up_states_16 = silu_5 = None + hidden_states_52 = torch._C._nn.linear( + up_states_17, + l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_17 = l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_11 = torch.nn.functional.dropout(hidden_states_52, 0.0, False, False) + hidden_states_52 = None + hidden_states_53 = hidden_states_48 + dropout_11 + hidden_states_48 = dropout_11 = None + hidden_states_54 = hidden_states_53.to(torch.float32) + pow_13 = hidden_states_54.pow(2) + variance_12 = pow_13.mean(-1, keepdim=True) + pow_13 = None + add_36 = variance_12 + 1e-05 + variance_12 = None + rsqrt_12 = torch.rsqrt(add_36) + add_36 = None + hidden_states_55 = hidden_states_54 * rsqrt_12 + hidden_states_54 = rsqrt_12 = None + to_25 = hidden_states_55.to(torch.bfloat16) + hidden_states_55 = None + hidden_states_56 = ( + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + * to_25 + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + to_25 + ) = None + qkv_6 = torch._C._nn.linear( + hidden_states_56, + l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_56 = l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_12 = qkv_6[(Ellipsis, 
slice(None, 3072, None))] + key_states_12 = qkv_6[(Ellipsis, slice(3072, 6144, None))] + value_states_12 = qkv_6[(Ellipsis, slice(6144, None, None))] + qkv_6 = None + view_18 = query_states_12.view((1, 2, -1, 96)) + query_states_12 = None + query_states_13 = view_18.transpose(1, 2) + view_18 = None + view_19 = key_states_12.view((1, 2, -1, 96)) + key_states_12 = None + key_states_13 = view_19.transpose(1, 2) + view_19 = None + view_20 = value_states_12.view((1, 2, -1, 96)) + value_states_12 = None + value_states_13 = view_20.transpose(1, 2) + view_20 = None + cos_6 = l_stack0_0_.unsqueeze(1) + sin_6 = l_stack0_1_.unsqueeze(1) + q_rot_6 = query_states_13[(Ellipsis, slice(None, 96, None))] + q_pass_6 = query_states_13[(Ellipsis, slice(96, None, None))] + query_states_13 = None + k_rot_6 = key_states_13[(Ellipsis, slice(None, 96, None))] + k_pass_6 = key_states_13[(Ellipsis, slice(96, None, None))] + key_states_13 = None + mul_56 = q_rot_6 * cos_6 + x1_12 = q_rot_6[(Ellipsis, slice(None, 48, None))] + x2_12 = q_rot_6[(Ellipsis, slice(48, None, None))] + q_rot_6 = None + neg_12 = -x2_12 + x2_12 = None + cat_24 = torch.cat((neg_12, x1_12), dim=-1) + neg_12 = x1_12 = None + mul_57 = cat_24 * sin_6 + cat_24 = None + add_37 = mul_56 + mul_57 + mul_56 = mul_57 = None + q_embed_6 = torch.cat([add_37, q_pass_6], dim=-1) + add_37 = q_pass_6 = None + mul_58 = k_rot_6 * cos_6 + cos_6 = None + x1_13 = k_rot_6[(Ellipsis, slice(None, 48, None))] + x2_13 = k_rot_6[(Ellipsis, slice(48, None, None))] + k_rot_6 = None + neg_13 = -x2_13 + x2_13 = None + cat_26 = torch.cat((neg_13, x1_13), dim=-1) + neg_13 = x1_13 = None + mul_59 = cat_26 * sin_6 + cat_26 = sin_6 = None + add_38 = mul_58 + mul_59 + mul_58 = mul_59 = None + k_embed_6 = torch.cat([add_38, k_pass_6], dim=-1) + add_38 = k_pass_6 = None + attention_mask_6 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_6 = q_embed_6.contiguous() + q_embed_6 = None + key_6 = k_embed_6.contiguous() + value_6 = value_states_13.contiguous() + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_6, + value_6, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_6 = key_6 = value_6 = attention_mask_6 = None + transpose_27 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_27.contiguous() + transpose_27 = None + reshape_6 = attn_output_25.reshape(1, 2, -1) + attn_output_25 = None + attn_output_26 = reshape_6.contiguous() + reshape_6 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_26 = l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_12 = torch.nn.functional.dropout(attn_output_27, 0.0, False, False) + attn_output_27 = None + hidden_states_57 = hidden_states_53 + dropout_12 + hidden_states_53 = dropout_12 = None + hidden_states_58 = hidden_states_57.to(torch.float32) + pow_14 = hidden_states_58.pow(2) + variance_13 = pow_14.mean(-1, keepdim=True) + pow_14 = None + add_40 = variance_13 + 1e-05 + variance_13 = None + rsqrt_13 = torch.rsqrt(add_40) + add_40 = None + hidden_states_59 = hidden_states_58 * rsqrt_13 + hidden_states_58 = rsqrt_13 = None + to_27 = hidden_states_59.to(torch.bfloat16) + hidden_states_59 = None + hidden_states_60 = ( + 
l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + * to_27 + ) + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = ( + to_27 + ) = None + up_states_18 = torch._C._nn.linear( + hidden_states_60, + l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_60 = l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_6 = up_states_18.chunk(2, dim=-1) + up_states_18 = None + gate_6 = chunk_6[0] + up_states_19 = chunk_6[1] + chunk_6 = None + silu_6 = torch.nn.functional.silu(gate_6, inplace=False) + gate_6 = None + up_states_20 = up_states_19 * silu_6 + up_states_19 = silu_6 = None + hidden_states_61 = torch._C._nn.linear( + up_states_20, + l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_20 = l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_13 = torch.nn.functional.dropout(hidden_states_61, 0.0, False, False) + hidden_states_61 = None + hidden_states_62 = hidden_states_57 + dropout_13 + hidden_states_57 = dropout_13 = None + hidden_states_63 = hidden_states_62.to(torch.float32) + pow_15 = hidden_states_63.pow(2) + variance_14 = pow_15.mean(-1, keepdim=True) + pow_15 = None + add_42 = variance_14 + 1e-05 + variance_14 = None + rsqrt_14 = torch.rsqrt(add_42) + add_42 = None + hidden_states_64 = hidden_states_63 * rsqrt_14 + hidden_states_63 = rsqrt_14 = None + to_29 = hidden_states_64.to(torch.bfloat16) + hidden_states_64 = None + hidden_states_65 = ( + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + * to_29 + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + to_29 + ) = None + qkv_7 = torch._C._nn.linear( + hidden_states_65, + l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_65 = l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_14 = qkv_7[(Ellipsis, slice(None, 3072, None))] + key_states_14 = qkv_7[(Ellipsis, slice(3072, 6144, None))] + value_states_14 = qkv_7[(Ellipsis, slice(6144, None, None))] + qkv_7 = None + view_21 = query_states_14.view((1, 2, -1, 96)) + query_states_14 = None + query_states_15 = view_21.transpose(1, 2) + view_21 = None + view_22 = key_states_14.view((1, 2, -1, 96)) + key_states_14 = None + key_states_15 = view_22.transpose(1, 2) + view_22 = None + view_23 = value_states_14.view((1, 2, -1, 96)) + value_states_14 = None + value_states_15 = view_23.transpose(1, 2) + view_23 = None + cos_7 = l_stack0_0_.unsqueeze(1) + sin_7 = l_stack0_1_.unsqueeze(1) + q_rot_7 = query_states_15[(Ellipsis, slice(None, 96, None))] + q_pass_7 = query_states_15[(Ellipsis, slice(96, None, None))] + query_states_15 = None + k_rot_7 = key_states_15[(Ellipsis, slice(None, 96, None))] + k_pass_7 = key_states_15[(Ellipsis, slice(96, None, None))] + key_states_15 = None + mul_65 = q_rot_7 * cos_7 + x1_14 = q_rot_7[(Ellipsis, slice(None, 48, None))] + x2_14 = q_rot_7[(Ellipsis, slice(48, None, None))] + q_rot_7 = None + neg_14 = -x2_14 + x2_14 = None + cat_28 = torch.cat((neg_14, x1_14), dim=-1) + neg_14 = x1_14 = None + mul_66 = cat_28 * sin_7 + cat_28 = None + add_43 = mul_65 + mul_66 + mul_65 = mul_66 = None + q_embed_7 = torch.cat([add_43, q_pass_7], dim=-1) + add_43 = q_pass_7 = None + mul_67 = k_rot_7 * cos_7 + cos_7 = None + x1_15 = 
k_rot_7[(Ellipsis, slice(None, 48, None))] + x2_15 = k_rot_7[(Ellipsis, slice(48, None, None))] + k_rot_7 = None + neg_15 = -x2_15 + x2_15 = None + cat_30 = torch.cat((neg_15, x1_15), dim=-1) + neg_15 = x1_15 = None + mul_68 = cat_30 * sin_7 + cat_30 = sin_7 = None + add_44 = mul_67 + mul_68 + mul_67 = mul_68 = None + k_embed_7 = torch.cat([add_44, k_pass_7], dim=-1) + add_44 = k_pass_7 = None + attention_mask_7 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_7 = q_embed_7.contiguous() + q_embed_7 = None + key_7 = k_embed_7.contiguous() + value_7 = value_states_15.contiguous() + attn_output_28 = torch._C._nn.scaled_dot_product_attention( + query_7, + key_7, + value_7, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_7 = key_7 = value_7 = attention_mask_7 = None + transpose_31 = attn_output_28.transpose(1, 2) + attn_output_28 = None + attn_output_29 = transpose_31.contiguous() + transpose_31 = None + reshape_7 = attn_output_29.reshape(1, 2, -1) + attn_output_29 = None + attn_output_30 = reshape_7.contiguous() + reshape_7 = None + attn_output_31 = torch._C._nn.linear( + attn_output_30, + l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_30 = l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_14 = torch.nn.functional.dropout(attn_output_31, 0.0, False, False) + attn_output_31 = None + hidden_states_66 = hidden_states_62 + dropout_14 + hidden_states_62 = dropout_14 = None + hidden_states_67 = hidden_states_66.to(torch.float32) + pow_16 = hidden_states_67.pow(2) + variance_15 = pow_16.mean(-1, keepdim=True) + pow_16 = None + add_46 = variance_15 + 1e-05 + variance_15 = None + rsqrt_15 = torch.rsqrt(add_46) + add_46 = None + hidden_states_68 = hidden_states_67 * rsqrt_15 + hidden_states_67 = rsqrt_15 = None + to_31 = hidden_states_68.to(torch.bfloat16) + hidden_states_68 = None + hidden_states_69 = ( + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + * to_31 + ) + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = ( + to_31 + ) = None + up_states_21 = torch._C._nn.linear( + hidden_states_69, + l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_69 = l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_7 = up_states_21.chunk(2, dim=-1) + up_states_21 = None + gate_7 = chunk_7[0] + up_states_22 = chunk_7[1] + chunk_7 = None + silu_7 = torch.nn.functional.silu(gate_7, inplace=False) + gate_7 = None + up_states_23 = up_states_22 * silu_7 + up_states_22 = silu_7 = None + hidden_states_70 = torch._C._nn.linear( + up_states_23, + l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_23 = l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_15 = torch.nn.functional.dropout(hidden_states_70, 0.0, False, False) + hidden_states_70 = None + hidden_states_71 = hidden_states_66 + dropout_15 + hidden_states_66 = dropout_15 = None + hidden_states_72 = hidden_states_71.to(torch.float32) + pow_17 = hidden_states_72.pow(2) + variance_16 = pow_17.mean(-1, keepdim=True) + pow_17 = None + add_48 = variance_16 + 1e-05 + variance_16 = None + rsqrt_16 = torch.rsqrt(add_48) + add_48 = 
None + hidden_states_73 = hidden_states_72 * rsqrt_16 + hidden_states_72 = rsqrt_16 = None + to_33 = hidden_states_73.to(torch.bfloat16) + hidden_states_73 = None + hidden_states_74 = ( + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + * to_33 + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + to_33 + ) = None + qkv_8 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_74 = l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_16 = qkv_8[(Ellipsis, slice(None, 3072, None))] + key_states_16 = qkv_8[(Ellipsis, slice(3072, 6144, None))] + value_states_16 = qkv_8[(Ellipsis, slice(6144, None, None))] + qkv_8 = None + view_24 = query_states_16.view((1, 2, -1, 96)) + query_states_16 = None + query_states_17 = view_24.transpose(1, 2) + view_24 = None + view_25 = key_states_16.view((1, 2, -1, 96)) + key_states_16 = None + key_states_17 = view_25.transpose(1, 2) + view_25 = None + view_26 = value_states_16.view((1, 2, -1, 96)) + value_states_16 = None + value_states_17 = view_26.transpose(1, 2) + view_26 = None + cos_8 = l_stack0_0_.unsqueeze(1) + sin_8 = l_stack0_1_.unsqueeze(1) + q_rot_8 = query_states_17[(Ellipsis, slice(None, 96, None))] + q_pass_8 = query_states_17[(Ellipsis, slice(96, None, None))] + query_states_17 = None + k_rot_8 = key_states_17[(Ellipsis, slice(None, 96, None))] + k_pass_8 = key_states_17[(Ellipsis, slice(96, None, None))] + key_states_17 = None + mul_74 = q_rot_8 * cos_8 + x1_16 = q_rot_8[(Ellipsis, slice(None, 48, None))] + x2_16 = q_rot_8[(Ellipsis, slice(48, None, None))] + q_rot_8 = None + neg_16 = -x2_16 + x2_16 = None + cat_32 = torch.cat((neg_16, x1_16), dim=-1) + neg_16 = x1_16 = None + mul_75 = cat_32 * sin_8 + cat_32 = None + add_49 = mul_74 + mul_75 + mul_74 = mul_75 = None + q_embed_8 = torch.cat([add_49, q_pass_8], dim=-1) + add_49 = q_pass_8 = None + mul_76 = k_rot_8 * cos_8 + cos_8 = None + x1_17 = k_rot_8[(Ellipsis, slice(None, 48, None))] + x2_17 = k_rot_8[(Ellipsis, slice(48, None, None))] + k_rot_8 = None + neg_17 = -x2_17 + x2_17 = None + cat_34 = torch.cat((neg_17, x1_17), dim=-1) + neg_17 = x1_17 = None + mul_77 = cat_34 * sin_8 + cat_34 = sin_8 = None + add_50 = mul_76 + mul_77 + mul_76 = mul_77 = None + k_embed_8 = torch.cat([add_50, k_pass_8], dim=-1) + add_50 = k_pass_8 = None + attention_mask_8 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_8 = q_embed_8.contiguous() + q_embed_8 = None + key_8 = k_embed_8.contiguous() + value_8 = value_states_17.contiguous() + attn_output_32 = torch._C._nn.scaled_dot_product_attention( + query_8, + key_8, + value_8, + attn_mask=attention_mask_8, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_8 = key_8 = value_8 = attention_mask_8 = None + transpose_35 = attn_output_32.transpose(1, 2) + attn_output_32 = None + attn_output_33 = transpose_35.contiguous() + transpose_35 = None + reshape_8 = attn_output_33.reshape(1, 2, -1) + attn_output_33 = None + attn_output_34 = reshape_8.contiguous() + reshape_8 = None + attn_output_35 = torch._C._nn.linear( + attn_output_34, + l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_34 = l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = 
(None) + dropout_16 = torch.nn.functional.dropout(attn_output_35, 0.0, False, False) + attn_output_35 = None + hidden_states_75 = hidden_states_71 + dropout_16 + hidden_states_71 = dropout_16 = None + hidden_states_76 = hidden_states_75.to(torch.float32) + pow_18 = hidden_states_76.pow(2) + variance_17 = pow_18.mean(-1, keepdim=True) + pow_18 = None + add_52 = variance_17 + 1e-05 + variance_17 = None + rsqrt_17 = torch.rsqrt(add_52) + add_52 = None + hidden_states_77 = hidden_states_76 * rsqrt_17 + hidden_states_76 = rsqrt_17 = None + to_35 = hidden_states_77.to(torch.bfloat16) + hidden_states_77 = None + hidden_states_78 = ( + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + * to_35 + ) + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = ( + to_35 + ) = None + up_states_24 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_78 = l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_8 = up_states_24.chunk(2, dim=-1) + up_states_24 = None + gate_8 = chunk_8[0] + up_states_25 = chunk_8[1] + chunk_8 = None + silu_8 = torch.nn.functional.silu(gate_8, inplace=False) + gate_8 = None + up_states_26 = up_states_25 * silu_8 + up_states_25 = silu_8 = None + hidden_states_79 = torch._C._nn.linear( + up_states_26, + l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_26 = l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_17 = torch.nn.functional.dropout(hidden_states_79, 0.0, False, False) + hidden_states_79 = None + hidden_states_80 = hidden_states_75 + dropout_17 + hidden_states_75 = dropout_17 = None + hidden_states_81 = hidden_states_80.to(torch.float32) + pow_19 = hidden_states_81.pow(2) + variance_18 = pow_19.mean(-1, keepdim=True) + pow_19 = None + add_54 = variance_18 + 1e-05 + variance_18 = None + rsqrt_18 = torch.rsqrt(add_54) + add_54 = None + hidden_states_82 = hidden_states_81 * rsqrt_18 + hidden_states_81 = rsqrt_18 = None + to_37 = hidden_states_82.to(torch.bfloat16) + hidden_states_82 = None + hidden_states_83 = ( + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + * to_37 + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + to_37 + ) = None + qkv_9 = torch._C._nn.linear( + hidden_states_83, + l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_83 = l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_18 = qkv_9[(Ellipsis, slice(None, 3072, None))] + key_states_18 = qkv_9[(Ellipsis, slice(3072, 6144, None))] + value_states_18 = qkv_9[(Ellipsis, slice(6144, None, None))] + qkv_9 = None + view_27 = query_states_18.view((1, 2, -1, 96)) + query_states_18 = None + query_states_19 = view_27.transpose(1, 2) + view_27 = None + view_28 = key_states_18.view((1, 2, -1, 96)) + key_states_18 = None + key_states_19 = view_28.transpose(1, 2) + view_28 = None + view_29 = value_states_18.view((1, 2, -1, 96)) + value_states_18 = None + value_states_19 = view_29.transpose(1, 2) + view_29 = None + cos_9 = l_stack0_0_.unsqueeze(1) + sin_9 = l_stack0_1_.unsqueeze(1) + q_rot_9 = query_states_19[(Ellipsis, slice(None, 96, None))] + q_pass_9 = query_states_19[(Ellipsis, slice(96, None, None))] + 
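# Rotary split for layer 9: rot = [..., :96], pass = [..., 96:]; with 96-dim heads the pass slice is empty, so the whole head is rotated (rotate-half over 48/48 channels). +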
query_states_19 = None + k_rot_9 = key_states_19[(Ellipsis, slice(None, 96, None))] + k_pass_9 = key_states_19[(Ellipsis, slice(96, None, None))] + key_states_19 = None + mul_83 = q_rot_9 * cos_9 + x1_18 = q_rot_9[(Ellipsis, slice(None, 48, None))] + x2_18 = q_rot_9[(Ellipsis, slice(48, None, None))] + q_rot_9 = None + neg_18 = -x2_18 + x2_18 = None + cat_36 = torch.cat((neg_18, x1_18), dim=-1) + neg_18 = x1_18 = None + mul_84 = cat_36 * sin_9 + cat_36 = None + add_55 = mul_83 + mul_84 + mul_83 = mul_84 = None + q_embed_9 = torch.cat([add_55, q_pass_9], dim=-1) + add_55 = q_pass_9 = None + mul_85 = k_rot_9 * cos_9 + cos_9 = None + x1_19 = k_rot_9[(Ellipsis, slice(None, 48, None))] + x2_19 = k_rot_9[(Ellipsis, slice(48, None, None))] + k_rot_9 = None + neg_19 = -x2_19 + x2_19 = None + cat_38 = torch.cat((neg_19, x1_19), dim=-1) + neg_19 = x1_19 = None + mul_86 = cat_38 * sin_9 + cat_38 = sin_9 = None + add_56 = mul_85 + mul_86 + mul_85 = mul_86 = None + k_embed_9 = torch.cat([add_56, k_pass_9], dim=-1) + add_56 = k_pass_9 = None + attention_mask_9 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_9 = q_embed_9.contiguous() + q_embed_9 = None + key_9 = k_embed_9.contiguous() + value_9 = value_states_19.contiguous() + attn_output_36 = torch._C._nn.scaled_dot_product_attention( + query_9, + key_9, + value_9, + attn_mask=attention_mask_9, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_9 = key_9 = value_9 = attention_mask_9 = None + transpose_39 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_37 = transpose_39.contiguous() + transpose_39 = None + reshape_9 = attn_output_37.reshape(1, 2, -1) + attn_output_37 = None + attn_output_38 = reshape_9.contiguous() + reshape_9 = None + attn_output_39 = torch._C._nn.linear( + attn_output_38, + l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_38 = l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_18 = torch.nn.functional.dropout(attn_output_39, 0.0, False, False) + attn_output_39 = None + hidden_states_84 = hidden_states_80 + dropout_18 + hidden_states_80 = dropout_18 = None + hidden_states_85 = hidden_states_84.to(torch.float32) + pow_20 = hidden_states_85.pow(2) + variance_19 = pow_20.mean(-1, keepdim=True) + pow_20 = None + add_58 = variance_19 + 1e-05 + variance_19 = None + rsqrt_19 = torch.rsqrt(add_58) + add_58 = None + hidden_states_86 = hidden_states_85 * rsqrt_19 + hidden_states_85 = rsqrt_19 = None + to_39 = hidden_states_86.to(torch.bfloat16) + hidden_states_86 = None + hidden_states_87 = ( + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + * to_39 + ) + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = ( + to_39 + ) = None + up_states_27 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_87 = l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_9 = up_states_27.chunk(2, dim=-1) + up_states_27 = None + gate_9 = chunk_9[0] + up_states_28 = chunk_9[1] + chunk_9 = None + silu_9 = torch.nn.functional.silu(gate_9, inplace=False) + gate_9 = None + up_states_29 = up_states_28 * silu_9 + up_states_28 = silu_9 = None + hidden_states_88 = torch._C._nn.linear( + up_states_29, + 
l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_29 = l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_19 = torch.nn.functional.dropout(hidden_states_88, 0.0, False, False) + hidden_states_88 = None + hidden_states_89 = hidden_states_84 + dropout_19 + hidden_states_84 = dropout_19 = None + hidden_states_90 = hidden_states_89.to(torch.float32) + pow_21 = hidden_states_90.pow(2) + variance_20 = pow_21.mean(-1, keepdim=True) + pow_21 = None + add_60 = variance_20 + 1e-05 + variance_20 = None + rsqrt_20 = torch.rsqrt(add_60) + add_60 = None + hidden_states_91 = hidden_states_90 * rsqrt_20 + hidden_states_90 = rsqrt_20 = None + to_41 = hidden_states_91.to(torch.bfloat16) + hidden_states_91 = None + hidden_states_92 = ( + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + * to_41 + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + to_41 + ) = None + qkv_10 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_92 = l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_20 = qkv_10[(Ellipsis, slice(None, 3072, None))] + key_states_20 = qkv_10[(Ellipsis, slice(3072, 6144, None))] + value_states_20 = qkv_10[(Ellipsis, slice(6144, None, None))] + qkv_10 = None + view_30 = query_states_20.view((1, 2, -1, 96)) + query_states_20 = None + query_states_21 = view_30.transpose(1, 2) + view_30 = None + view_31 = key_states_20.view((1, 2, -1, 96)) + key_states_20 = None + key_states_21 = view_31.transpose(1, 2) + view_31 = None + view_32 = value_states_20.view((1, 2, -1, 96)) + value_states_20 = None + value_states_21 = view_32.transpose(1, 2) + view_32 = None + cos_10 = l_stack0_0_.unsqueeze(1) + sin_10 = l_stack0_1_.unsqueeze(1) + q_rot_10 = query_states_21[(Ellipsis, slice(None, 96, None))] + q_pass_10 = query_states_21[(Ellipsis, slice(96, None, None))] + query_states_21 = None + k_rot_10 = key_states_21[(Ellipsis, slice(None, 96, None))] + k_pass_10 = key_states_21[(Ellipsis, slice(96, None, None))] + key_states_21 = None + mul_92 = q_rot_10 * cos_10 + x1_20 = q_rot_10[(Ellipsis, slice(None, 48, None))] + x2_20 = q_rot_10[(Ellipsis, slice(48, None, None))] + q_rot_10 = None + neg_20 = -x2_20 + x2_20 = None + cat_40 = torch.cat((neg_20, x1_20), dim=-1) + neg_20 = x1_20 = None + mul_93 = cat_40 * sin_10 + cat_40 = None + add_61 = mul_92 + mul_93 + mul_92 = mul_93 = None + q_embed_10 = torch.cat([add_61, q_pass_10], dim=-1) + add_61 = q_pass_10 = None + mul_94 = k_rot_10 * cos_10 + cos_10 = None + x1_21 = k_rot_10[(Ellipsis, slice(None, 48, None))] + x2_21 = k_rot_10[(Ellipsis, slice(48, None, None))] + k_rot_10 = None + neg_21 = -x2_21 + x2_21 = None + cat_42 = torch.cat((neg_21, x1_21), dim=-1) + neg_21 = x1_21 = None + mul_95 = cat_42 * sin_10 + cat_42 = sin_10 = None + add_62 = mul_94 + mul_95 + mul_94 = mul_95 = None + k_embed_10 = torch.cat([add_62, k_pass_10], dim=-1) + add_62 = k_pass_10 = None + attention_mask_10 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_10 = q_embed_10.contiguous() + q_embed_10 = None + key_10 = k_embed_10.contiguous() + value_10 = value_states_21.contiguous() + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + query_10, + key_10, + value_10, 
+ attn_mask=attention_mask_10, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_10 = key_10 = value_10 = attention_mask_10 = None + transpose_43 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_43.contiguous() + transpose_43 = None + reshape_10 = attn_output_41.reshape(1, 2, -1) + attn_output_41 = None + attn_output_42 = reshape_10.contiguous() + reshape_10 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_42 = l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_20 = torch.nn.functional.dropout(attn_output_43, 0.0, False, False) + attn_output_43 = None + hidden_states_93 = hidden_states_89 + dropout_20 + hidden_states_89 = dropout_20 = None + hidden_states_94 = hidden_states_93.to(torch.float32) + pow_22 = hidden_states_94.pow(2) + variance_21 = pow_22.mean(-1, keepdim=True) + pow_22 = None + add_64 = variance_21 + 1e-05 + variance_21 = None + rsqrt_21 = torch.rsqrt(add_64) + add_64 = None + hidden_states_95 = hidden_states_94 * rsqrt_21 + hidden_states_94 = rsqrt_21 = None + to_43 = hidden_states_95.to(torch.bfloat16) + hidden_states_95 = None + hidden_states_96 = ( + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + * to_43 + ) + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = ( + to_43 + ) = None + up_states_30 = torch._C._nn.linear( + hidden_states_96, + l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_96 = l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_10 = up_states_30.chunk(2, dim=-1) + up_states_30 = None + gate_10 = chunk_10[0] + up_states_31 = chunk_10[1] + chunk_10 = None + silu_10 = torch.nn.functional.silu(gate_10, inplace=False) + gate_10 = None + up_states_32 = up_states_31 * silu_10 + up_states_31 = silu_10 = None + hidden_states_97 = torch._C._nn.linear( + up_states_32, + l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_32 = l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_21 = torch.nn.functional.dropout(hidden_states_97, 0.0, False, False) + hidden_states_97 = None + hidden_states_98 = hidden_states_93 + dropout_21 + hidden_states_93 = dropout_21 = None + hidden_states_99 = hidden_states_98.to(torch.float32) + pow_23 = hidden_states_99.pow(2) + variance_22 = pow_23.mean(-1, keepdim=True) + pow_23 = None + add_66 = variance_22 + 1e-05 + variance_22 = None + rsqrt_22 = torch.rsqrt(add_66) + add_66 = None + hidden_states_100 = hidden_states_99 * rsqrt_22 + hidden_states_99 = rsqrt_22 = None + to_45 = hidden_states_100.to(torch.bfloat16) + hidden_states_100 = None + hidden_states_101 = ( + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + * to_45 + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + to_45 + ) = None + qkv_11 = torch._C._nn.linear( + hidden_states_101, + l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_101 = l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_22 = qkv_11[(Ellipsis, slice(None, 3072, None))] + key_states_22 = 
qkv_11[(Ellipsis, slice(3072, 6144, None))] + value_states_22 = qkv_11[(Ellipsis, slice(6144, None, None))] + qkv_11 = None + view_33 = query_states_22.view((1, 2, -1, 96)) + query_states_22 = None + query_states_23 = view_33.transpose(1, 2) + view_33 = None + view_34 = key_states_22.view((1, 2, -1, 96)) + key_states_22 = None + key_states_23 = view_34.transpose(1, 2) + view_34 = None + view_35 = value_states_22.view((1, 2, -1, 96)) + value_states_22 = None + value_states_23 = view_35.transpose(1, 2) + view_35 = None + cos_11 = l_stack0_0_.unsqueeze(1) + sin_11 = l_stack0_1_.unsqueeze(1) + q_rot_11 = query_states_23[(Ellipsis, slice(None, 96, None))] + q_pass_11 = query_states_23[(Ellipsis, slice(96, None, None))] + query_states_23 = None + k_rot_11 = key_states_23[(Ellipsis, slice(None, 96, None))] + k_pass_11 = key_states_23[(Ellipsis, slice(96, None, None))] + key_states_23 = None + mul_101 = q_rot_11 * cos_11 + x1_22 = q_rot_11[(Ellipsis, slice(None, 48, None))] + x2_22 = q_rot_11[(Ellipsis, slice(48, None, None))] + q_rot_11 = None + neg_22 = -x2_22 + x2_22 = None + cat_44 = torch.cat((neg_22, x1_22), dim=-1) + neg_22 = x1_22 = None + mul_102 = cat_44 * sin_11 + cat_44 = None + add_67 = mul_101 + mul_102 + mul_101 = mul_102 = None + q_embed_11 = torch.cat([add_67, q_pass_11], dim=-1) + add_67 = q_pass_11 = None + mul_103 = k_rot_11 * cos_11 + cos_11 = None + x1_23 = k_rot_11[(Ellipsis, slice(None, 48, None))] + x2_23 = k_rot_11[(Ellipsis, slice(48, None, None))] + k_rot_11 = None + neg_23 = -x2_23 + x2_23 = None + cat_46 = torch.cat((neg_23, x1_23), dim=-1) + neg_23 = x1_23 = None + mul_104 = cat_46 * sin_11 + cat_46 = sin_11 = None + add_68 = mul_103 + mul_104 + mul_103 = mul_104 = None + k_embed_11 = torch.cat([add_68, k_pass_11], dim=-1) + add_68 = k_pass_11 = None + attention_mask_11 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_11 = q_embed_11.contiguous() + q_embed_11 = None + key_11 = k_embed_11.contiguous() + value_11 = value_states_23.contiguous() + attn_output_44 = torch._C._nn.scaled_dot_product_attention( + query_11, + key_11, + value_11, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_11 = key_11 = value_11 = attention_mask_11 = None + transpose_47 = attn_output_44.transpose(1, 2) + attn_output_44 = None + attn_output_45 = transpose_47.contiguous() + transpose_47 = None + reshape_11 = attn_output_45.reshape(1, 2, -1) + attn_output_45 = None + attn_output_46 = reshape_11.contiguous() + reshape_11 = None + attn_output_47 = torch._C._nn.linear( + attn_output_46, + l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_46 = l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_22 = torch.nn.functional.dropout(attn_output_47, 0.0, False, False) + attn_output_47 = None + hidden_states_102 = hidden_states_98 + dropout_22 + hidden_states_98 = dropout_22 = None + hidden_states_103 = hidden_states_102.to(torch.float32) + pow_24 = hidden_states_103.pow(2) + variance_23 = pow_24.mean(-1, keepdim=True) + pow_24 = None + add_70 = variance_23 + 1e-05 + variance_23 = None + rsqrt_23 = torch.rsqrt(add_70) + add_70 = None + hidden_states_104 = hidden_states_103 * rsqrt_23 + hidden_states_103 = rsqrt_23 = None + to_47 = hidden_states_104.to(torch.bfloat16) + hidden_states_104 = None + hidden_states_105 = ( + 
l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + * to_47 + ) + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = ( + to_47 + ) = None + up_states_33 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_105 = l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_11 = up_states_33.chunk(2, dim=-1) + up_states_33 = None + gate_11 = chunk_11[0] + up_states_34 = chunk_11[1] + chunk_11 = None + silu_11 = torch.nn.functional.silu(gate_11, inplace=False) + gate_11 = None + up_states_35 = up_states_34 * silu_11 + up_states_34 = silu_11 = None + hidden_states_106 = torch._C._nn.linear( + up_states_35, + l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_35 = l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_23 = torch.nn.functional.dropout(hidden_states_106, 0.0, False, False) + hidden_states_106 = None + hidden_states_107 = hidden_states_102 + dropout_23 + hidden_states_102 = dropout_23 = None + hidden_states_108 = hidden_states_107.to(torch.float32) + pow_25 = hidden_states_108.pow(2) + variance_24 = pow_25.mean(-1, keepdim=True) + pow_25 = None + add_72 = variance_24 + 1e-05 + variance_24 = None + rsqrt_24 = torch.rsqrt(add_72) + add_72 = None + hidden_states_109 = hidden_states_108 * rsqrt_24 + hidden_states_108 = rsqrt_24 = None + to_49 = hidden_states_109.to(torch.bfloat16) + hidden_states_109 = None + hidden_states_110 = ( + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + * to_49 + ) + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + to_49 + ) = None + qkv_12 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_110 = l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_24 = qkv_12[(Ellipsis, slice(None, 3072, None))] + key_states_24 = qkv_12[(Ellipsis, slice(3072, 6144, None))] + value_states_24 = qkv_12[(Ellipsis, slice(6144, None, None))] + qkv_12 = None + view_36 = query_states_24.view((1, 2, -1, 96)) + query_states_24 = None + query_states_25 = view_36.transpose(1, 2) + view_36 = None + view_37 = key_states_24.view((1, 2, -1, 96)) + key_states_24 = None + key_states_25 = view_37.transpose(1, 2) + view_37 = None + view_38 = value_states_24.view((1, 2, -1, 96)) + value_states_24 = None + value_states_25 = view_38.transpose(1, 2) + view_38 = None + cos_12 = l_stack0_0_.unsqueeze(1) + sin_12 = l_stack0_1_.unsqueeze(1) + q_rot_12 = query_states_25[(Ellipsis, slice(None, 96, None))] + q_pass_12 = query_states_25[(Ellipsis, slice(96, None, None))] + query_states_25 = None + k_rot_12 = key_states_25[(Ellipsis, slice(None, 96, None))] + k_pass_12 = key_states_25[(Ellipsis, slice(96, None, None))] + key_states_25 = None + mul_110 = q_rot_12 * cos_12 + x1_24 = q_rot_12[(Ellipsis, slice(None, 48, None))] + x2_24 = q_rot_12[(Ellipsis, slice(48, None, None))] + q_rot_12 = None + neg_24 = -x2_24 + x2_24 = None + cat_48 = torch.cat((neg_24, x1_24), dim=-1) + neg_24 = x1_24 = None + mul_111 = cat_48 * sin_12 + cat_48 = None + add_73 = mul_110 + mul_111 + mul_110 = mul_111 = None + q_embed_12 = torch.cat([add_73, q_pass_12], dim=-1) + add_73 = 
q_pass_12 = None + mul_112 = k_rot_12 * cos_12 + cos_12 = None + x1_25 = k_rot_12[(Ellipsis, slice(None, 48, None))] + x2_25 = k_rot_12[(Ellipsis, slice(48, None, None))] + k_rot_12 = None + neg_25 = -x2_25 + x2_25 = None + cat_50 = torch.cat((neg_25, x1_25), dim=-1) + neg_25 = x1_25 = None + mul_113 = cat_50 * sin_12 + cat_50 = sin_12 = None + add_74 = mul_112 + mul_113 + mul_112 = mul_113 = None + k_embed_12 = torch.cat([add_74, k_pass_12], dim=-1) + add_74 = k_pass_12 = None + attention_mask_12 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_12 = q_embed_12.contiguous() + q_embed_12 = None + key_12 = k_embed_12.contiguous() + value_12 = value_states_25.contiguous() + attn_output_48 = torch._C._nn.scaled_dot_product_attention( + query_12, + key_12, + value_12, + attn_mask=attention_mask_12, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_12 = key_12 = value_12 = attention_mask_12 = None + transpose_51 = attn_output_48.transpose(1, 2) + attn_output_48 = None + attn_output_49 = transpose_51.contiguous() + transpose_51 = None + reshape_12 = attn_output_49.reshape(1, 2, -1) + attn_output_49 = None + attn_output_50 = reshape_12.contiguous() + reshape_12 = None + attn_output_51 = torch._C._nn.linear( + attn_output_50, + l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_50 = l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_24 = torch.nn.functional.dropout(attn_output_51, 0.0, False, False) + attn_output_51 = None + hidden_states_111 = hidden_states_107 + dropout_24 + hidden_states_107 = dropout_24 = None + hidden_states_112 = hidden_states_111.to(torch.float32) + pow_26 = hidden_states_112.pow(2) + variance_25 = pow_26.mean(-1, keepdim=True) + pow_26 = None + add_76 = variance_25 + 1e-05 + variance_25 = None + rsqrt_25 = torch.rsqrt(add_76) + add_76 = None + hidden_states_113 = hidden_states_112 * rsqrt_25 + hidden_states_112 = rsqrt_25 = None + to_51 = hidden_states_113.to(torch.bfloat16) + hidden_states_113 = None + hidden_states_114 = ( + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + * to_51 + ) + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = ( + to_51 + ) = None + up_states_36 = torch._C._nn.linear( + hidden_states_114, + l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_114 = l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_12 = up_states_36.chunk(2, dim=-1) + up_states_36 = None + gate_12 = chunk_12[0] + up_states_37 = chunk_12[1] + chunk_12 = None + silu_12 = torch.nn.functional.silu(gate_12, inplace=False) + gate_12 = None + up_states_38 = up_states_37 * silu_12 + up_states_37 = silu_12 = None + hidden_states_115 = torch._C._nn.linear( + up_states_38, + l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_38 = l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_25 = torch.nn.functional.dropout(hidden_states_115, 0.0, False, False) + hidden_states_115 = None + hidden_states_116 = hidden_states_111 + dropout_25 + hidden_states_111 = dropout_25 = None + hidden_states_117 = hidden_states_116.to(torch.float32) + pow_27 = hidden_states_117.pow(2) + variance_26 
= pow_27.mean(-1, keepdim=True) + pow_27 = None + add_78 = variance_26 + 1e-05 + variance_26 = None + rsqrt_26 = torch.rsqrt(add_78) + add_78 = None + hidden_states_118 = hidden_states_117 * rsqrt_26 + hidden_states_117 = rsqrt_26 = None + to_53 = hidden_states_118.to(torch.bfloat16) + hidden_states_118 = None + hidden_states_119 = ( + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + * to_53 + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + to_53 + ) = None + qkv_13 = torch._C._nn.linear( + hidden_states_119, + l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_119 = l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_26 = qkv_13[(Ellipsis, slice(None, 3072, None))] + key_states_26 = qkv_13[(Ellipsis, slice(3072, 6144, None))] + value_states_26 = qkv_13[(Ellipsis, slice(6144, None, None))] + qkv_13 = None + view_39 = query_states_26.view((1, 2, -1, 96)) + query_states_26 = None + query_states_27 = view_39.transpose(1, 2) + view_39 = None + view_40 = key_states_26.view((1, 2, -1, 96)) + key_states_26 = None + key_states_27 = view_40.transpose(1, 2) + view_40 = None + view_41 = value_states_26.view((1, 2, -1, 96)) + value_states_26 = None + value_states_27 = view_41.transpose(1, 2) + view_41 = None + cos_13 = l_stack0_0_.unsqueeze(1) + sin_13 = l_stack0_1_.unsqueeze(1) + q_rot_13 = query_states_27[(Ellipsis, slice(None, 96, None))] + q_pass_13 = query_states_27[(Ellipsis, slice(96, None, None))] + query_states_27 = None + k_rot_13 = key_states_27[(Ellipsis, slice(None, 96, None))] + k_pass_13 = key_states_27[(Ellipsis, slice(96, None, None))] + key_states_27 = None + mul_119 = q_rot_13 * cos_13 + x1_26 = q_rot_13[(Ellipsis, slice(None, 48, None))] + x2_26 = q_rot_13[(Ellipsis, slice(48, None, None))] + q_rot_13 = None + neg_26 = -x2_26 + x2_26 = None + cat_52 = torch.cat((neg_26, x1_26), dim=-1) + neg_26 = x1_26 = None + mul_120 = cat_52 * sin_13 + cat_52 = None + add_79 = mul_119 + mul_120 + mul_119 = mul_120 = None + q_embed_13 = torch.cat([add_79, q_pass_13], dim=-1) + add_79 = q_pass_13 = None + mul_121 = k_rot_13 * cos_13 + cos_13 = None + x1_27 = k_rot_13[(Ellipsis, slice(None, 48, None))] + x2_27 = k_rot_13[(Ellipsis, slice(48, None, None))] + k_rot_13 = None + neg_27 = -x2_27 + x2_27 = None + cat_54 = torch.cat((neg_27, x1_27), dim=-1) + neg_27 = x1_27 = None + mul_122 = cat_54 * sin_13 + cat_54 = sin_13 = None + add_80 = mul_121 + mul_122 + mul_121 = mul_122 = None + k_embed_13 = torch.cat([add_80, k_pass_13], dim=-1) + add_80 = k_pass_13 = None + attention_mask_13 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_13 = q_embed_13.contiguous() + q_embed_13 = None + key_13 = k_embed_13.contiguous() + value_13 = value_states_27.contiguous() + attn_output_52 = torch._C._nn.scaled_dot_product_attention( + query_13, + key_13, + value_13, + attn_mask=attention_mask_13, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_13 = key_13 = value_13 = attention_mask_13 = None + transpose_55 = attn_output_52.transpose(1, 2) + attn_output_52 = None + attn_output_53 = transpose_55.contiguous() + transpose_55 = None + reshape_13 = attn_output_53.reshape(1, 2, -1) + attn_output_53 = None + attn_output_54 = reshape_13.contiguous() + reshape_13 = None + attn_output_55 = torch._C._nn.linear( + 
attn_output_54, + l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_54 = l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_26 = torch.nn.functional.dropout(attn_output_55, 0.0, False, False) + attn_output_55 = None + hidden_states_120 = hidden_states_116 + dropout_26 + hidden_states_116 = dropout_26 = None + hidden_states_121 = hidden_states_120.to(torch.float32) + pow_28 = hidden_states_121.pow(2) + variance_27 = pow_28.mean(-1, keepdim=True) + pow_28 = None + add_82 = variance_27 + 1e-05 + variance_27 = None + rsqrt_27 = torch.rsqrt(add_82) + add_82 = None + hidden_states_122 = hidden_states_121 * rsqrt_27 + hidden_states_121 = rsqrt_27 = None + to_55 = hidden_states_122.to(torch.bfloat16) + hidden_states_122 = None + hidden_states_123 = ( + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + * to_55 + ) + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = ( + to_55 + ) = None + up_states_39 = torch._C._nn.linear( + hidden_states_123, + l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_123 = l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_13 = up_states_39.chunk(2, dim=-1) + up_states_39 = None + gate_13 = chunk_13[0] + up_states_40 = chunk_13[1] + chunk_13 = None + silu_13 = torch.nn.functional.silu(gate_13, inplace=False) + gate_13 = None + up_states_41 = up_states_40 * silu_13 + up_states_40 = silu_13 = None + hidden_states_124 = torch._C._nn.linear( + up_states_41, + l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_41 = l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_27 = torch.nn.functional.dropout(hidden_states_124, 0.0, False, False) + hidden_states_124 = None + hidden_states_125 = hidden_states_120 + dropout_27 + hidden_states_120 = dropout_27 = None + hidden_states_126 = hidden_states_125.to(torch.float32) + pow_29 = hidden_states_126.pow(2) + variance_28 = pow_29.mean(-1, keepdim=True) + pow_29 = None + add_84 = variance_28 + 1e-05 + variance_28 = None + rsqrt_28 = torch.rsqrt(add_84) + add_84 = None + hidden_states_127 = hidden_states_126 * rsqrt_28 + hidden_states_126 = rsqrt_28 = None + to_57 = hidden_states_127.to(torch.bfloat16) + hidden_states_127 = None + hidden_states_128 = ( + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + * to_57 + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + to_57 + ) = None + qkv_14 = torch._C._nn.linear( + hidden_states_128, + l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_128 = l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_28 = qkv_14[(Ellipsis, slice(None, 3072, None))] + key_states_28 = qkv_14[(Ellipsis, slice(3072, 6144, None))] + value_states_28 = qkv_14[(Ellipsis, slice(6144, None, None))] + qkv_14 = None + view_42 = query_states_28.view((1, 2, -1, 96)) + query_states_28 = None + query_states_29 = view_42.transpose(1, 2) + view_42 = None + view_43 = key_states_28.view((1, 2, -1, 96)) + key_states_28 = None + key_states_29 = view_43.transpose(1, 2) + view_43 = None + view_44 = value_states_28.view((1, 2, -1, 96)) + value_states_28 = 
None + value_states_29 = view_44.transpose(1, 2) + view_44 = None + cos_14 = l_stack0_0_.unsqueeze(1) + sin_14 = l_stack0_1_.unsqueeze(1) + q_rot_14 = query_states_29[(Ellipsis, slice(None, 96, None))] + q_pass_14 = query_states_29[(Ellipsis, slice(96, None, None))] + query_states_29 = None + k_rot_14 = key_states_29[(Ellipsis, slice(None, 96, None))] + k_pass_14 = key_states_29[(Ellipsis, slice(96, None, None))] + key_states_29 = None + mul_128 = q_rot_14 * cos_14 + x1_28 = q_rot_14[(Ellipsis, slice(None, 48, None))] + x2_28 = q_rot_14[(Ellipsis, slice(48, None, None))] + q_rot_14 = None + neg_28 = -x2_28 + x2_28 = None + cat_56 = torch.cat((neg_28, x1_28), dim=-1) + neg_28 = x1_28 = None + mul_129 = cat_56 * sin_14 + cat_56 = None + add_85 = mul_128 + mul_129 + mul_128 = mul_129 = None + q_embed_14 = torch.cat([add_85, q_pass_14], dim=-1) + add_85 = q_pass_14 = None + mul_130 = k_rot_14 * cos_14 + cos_14 = None + x1_29 = k_rot_14[(Ellipsis, slice(None, 48, None))] + x2_29 = k_rot_14[(Ellipsis, slice(48, None, None))] + k_rot_14 = None + neg_29 = -x2_29 + x2_29 = None + cat_58 = torch.cat((neg_29, x1_29), dim=-1) + neg_29 = x1_29 = None + mul_131 = cat_58 * sin_14 + cat_58 = sin_14 = None + add_86 = mul_130 + mul_131 + mul_130 = mul_131 = None + k_embed_14 = torch.cat([add_86, k_pass_14], dim=-1) + add_86 = k_pass_14 = None + attention_mask_14 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_14 = q_embed_14.contiguous() + q_embed_14 = None + key_14 = k_embed_14.contiguous() + value_14 = value_states_29.contiguous() + attn_output_56 = torch._C._nn.scaled_dot_product_attention( + query_14, + key_14, + value_14, + attn_mask=attention_mask_14, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_14 = key_14 = value_14 = attention_mask_14 = None + transpose_59 = attn_output_56.transpose(1, 2) + attn_output_56 = None + attn_output_57 = transpose_59.contiguous() + transpose_59 = None + reshape_14 = attn_output_57.reshape(1, 2, -1) + attn_output_57 = None + attn_output_58 = reshape_14.contiguous() + reshape_14 = None + attn_output_59 = torch._C._nn.linear( + attn_output_58, + l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_58 = l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_28 = torch.nn.functional.dropout(attn_output_59, 0.0, False, False) + attn_output_59 = None + hidden_states_129 = hidden_states_125 + dropout_28 + hidden_states_125 = dropout_28 = None + hidden_states_130 = hidden_states_129.to(torch.float32) + pow_30 = hidden_states_130.pow(2) + variance_29 = pow_30.mean(-1, keepdim=True) + pow_30 = None + add_88 = variance_29 + 1e-05 + variance_29 = None + rsqrt_29 = torch.rsqrt(add_88) + add_88 = None + hidden_states_131 = hidden_states_130 * rsqrt_29 + hidden_states_130 = rsqrt_29 = None + to_59 = hidden_states_131.to(torch.bfloat16) + hidden_states_131 = None + hidden_states_132 = ( + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + * to_59 + ) + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = ( + to_59 + ) = None + up_states_42 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_132 = l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = 
(None) + chunk_14 = up_states_42.chunk(2, dim=-1) + up_states_42 = None + gate_14 = chunk_14[0] + up_states_43 = chunk_14[1] + chunk_14 = None + silu_14 = torch.nn.functional.silu(gate_14, inplace=False) + gate_14 = None + up_states_44 = up_states_43 * silu_14 + up_states_43 = silu_14 = None + hidden_states_133 = torch._C._nn.linear( + up_states_44, + l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_44 = l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_29 = torch.nn.functional.dropout(hidden_states_133, 0.0, False, False) + hidden_states_133 = None + hidden_states_134 = hidden_states_129 + dropout_29 + hidden_states_129 = dropout_29 = None + hidden_states_135 = hidden_states_134.to(torch.float32) + pow_31 = hidden_states_135.pow(2) + variance_30 = pow_31.mean(-1, keepdim=True) + pow_31 = None + add_90 = variance_30 + 1e-05 + variance_30 = None + rsqrt_30 = torch.rsqrt(add_90) + add_90 = None + hidden_states_136 = hidden_states_135 * rsqrt_30 + hidden_states_135 = rsqrt_30 = None + to_61 = hidden_states_136.to(torch.bfloat16) + hidden_states_136 = None + hidden_states_137 = ( + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + * to_61 + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + to_61 + ) = None + qkv_15 = torch._C._nn.linear( + hidden_states_137, + l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_137 = l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_30 = qkv_15[(Ellipsis, slice(None, 3072, None))] + key_states_30 = qkv_15[(Ellipsis, slice(3072, 6144, None))] + value_states_30 = qkv_15[(Ellipsis, slice(6144, None, None))] + qkv_15 = None + view_45 = query_states_30.view((1, 2, -1, 96)) + query_states_30 = None + query_states_31 = view_45.transpose(1, 2) + view_45 = None + view_46 = key_states_30.view((1, 2, -1, 96)) + key_states_30 = None + key_states_31 = view_46.transpose(1, 2) + view_46 = None + view_47 = value_states_30.view((1, 2, -1, 96)) + value_states_30 = None + value_states_31 = view_47.transpose(1, 2) + view_47 = None + cos_15 = l_stack0_0_.unsqueeze(1) + sin_15 = l_stack0_1_.unsqueeze(1) + q_rot_15 = query_states_31[(Ellipsis, slice(None, 96, None))] + q_pass_15 = query_states_31[(Ellipsis, slice(96, None, None))] + query_states_31 = None + k_rot_15 = key_states_31[(Ellipsis, slice(None, 96, None))] + k_pass_15 = key_states_31[(Ellipsis, slice(96, None, None))] + key_states_31 = None + mul_137 = q_rot_15 * cos_15 + x1_30 = q_rot_15[(Ellipsis, slice(None, 48, None))] + x2_30 = q_rot_15[(Ellipsis, slice(48, None, None))] + q_rot_15 = None + neg_30 = -x2_30 + x2_30 = None + cat_60 = torch.cat((neg_30, x1_30), dim=-1) + neg_30 = x1_30 = None + mul_138 = cat_60 * sin_15 + cat_60 = None + add_91 = mul_137 + mul_138 + mul_137 = mul_138 = None + q_embed_15 = torch.cat([add_91, q_pass_15], dim=-1) + add_91 = q_pass_15 = None + mul_139 = k_rot_15 * cos_15 + cos_15 = None + x1_31 = k_rot_15[(Ellipsis, slice(None, 48, None))] + x2_31 = k_rot_15[(Ellipsis, slice(48, None, None))] + k_rot_15 = None + neg_31 = -x2_31 + x2_31 = None + cat_62 = torch.cat((neg_31, x1_31), dim=-1) + neg_31 = x1_31 = None + mul_140 = cat_62 * sin_15 + cat_62 = sin_15 = None + add_92 = mul_139 + mul_140 + mul_139 = mul_140 = None + k_embed_15 = torch.cat([add_92, k_pass_15], dim=-1) + add_92 = k_pass_15 = None 
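+ # The shared 4-D causal mask is sliced to the captured key length (2) in every layer before SDPA.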
+ attention_mask_15 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_15 = q_embed_15.contiguous() + q_embed_15 = None + key_15 = k_embed_15.contiguous() + value_15 = value_states_31.contiguous() + attn_output_60 = torch._C._nn.scaled_dot_product_attention( + query_15, + key_15, + value_15, + attn_mask=attention_mask_15, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_15 = key_15 = value_15 = attention_mask_15 = None + transpose_63 = attn_output_60.transpose(1, 2) + attn_output_60 = None + attn_output_61 = transpose_63.contiguous() + transpose_63 = None + reshape_15 = attn_output_61.reshape(1, 2, -1) + attn_output_61 = None + attn_output_62 = reshape_15.contiguous() + reshape_15 = None + attn_output_63 = torch._C._nn.linear( + attn_output_62, + l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_62 = l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_30 = torch.nn.functional.dropout(attn_output_63, 0.0, False, False) + attn_output_63 = None + hidden_states_138 = hidden_states_134 + dropout_30 + hidden_states_134 = dropout_30 = None + hidden_states_139 = hidden_states_138.to(torch.float32) + pow_32 = hidden_states_139.pow(2) + variance_31 = pow_32.mean(-1, keepdim=True) + pow_32 = None + add_94 = variance_31 + 1e-05 + variance_31 = None + rsqrt_31 = torch.rsqrt(add_94) + add_94 = None + hidden_states_140 = hidden_states_139 * rsqrt_31 + hidden_states_139 = rsqrt_31 = None + to_63 = hidden_states_140.to(torch.bfloat16) + hidden_states_140 = None + hidden_states_141 = ( + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + * to_63 + ) + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = ( + to_63 + ) = None + up_states_45 = torch._C._nn.linear( + hidden_states_141, + l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_141 = l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_15 = up_states_45.chunk(2, dim=-1) + up_states_45 = None + gate_15 = chunk_15[0] + up_states_46 = chunk_15[1] + chunk_15 = None + silu_15 = torch.nn.functional.silu(gate_15, inplace=False) + gate_15 = None + up_states_47 = up_states_46 * silu_15 + up_states_46 = silu_15 = None + hidden_states_142 = torch._C._nn.linear( + up_states_47, + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_47 = l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_31 = torch.nn.functional.dropout(hidden_states_142, 0.0, False, False) + hidden_states_142 = None + hidden_states_143 = hidden_states_138 + dropout_31 + hidden_states_138 = dropout_31 = None + hidden_states_144 = hidden_states_143.to(torch.float32) + pow_33 = hidden_states_144.pow(2) + variance_32 = pow_33.mean(-1, keepdim=True) + pow_33 = None + add_96 = variance_32 + 1e-05 + variance_32 = None + rsqrt_32 = torch.rsqrt(add_96) + add_96 = None + hidden_states_145 = hidden_states_144 * rsqrt_32 + hidden_states_144 = rsqrt_32 = None + to_65 = hidden_states_145.to(torch.bfloat16) + hidden_states_145 = None + hidden_states_146 = ( + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + * to_65 + ) + 
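
Every attention block above collapses into a single scaled_dot_product_attention call. The hard-coded scale 0.10206207261596575 is 1/sqrt(96) = 1/sqrt(head_dim), constant-folded by the tracer, and the attention mask is the shared causal mask sliced to the current key length (2 in this trace, hence slice(None, 2, None)). A sketch of the equivalent call with dummy tensors (illustrative names, not the generated code):

import math
import torch
import torch.nn.functional as F

batch, heads, seq, head_dim = 1, 32, 2, 96
query = torch.randn(batch, heads, seq, head_dim)
key = torch.randn(batch, heads, seq, head_dim)
value = torch.randn(batch, heads, seq, head_dim)
causal_mask = torch.zeros(batch, 1, seq, seq)      # additive mask; 0 = attend

attn_out = F.scaled_dot_product_attention(
    query, key, value,
    attn_mask=causal_mask[:, :, :, :seq],          # the slice(None, 2, None) above
    dropout_p=0.0,
    scale=1.0 / math.sqrt(head_dim),               # == 0.10206207261596575
    is_causal=False,                               # mask is supplied explicitly
)
# (b, h, s, d) -> (b, s, h*d); the o_proj linear follows in the trace.
attn_out = attn_out.transpose(1, 2).reshape(batch, seq, heads * head_dim)
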
l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + to_65 + ) = None + qkv_16 = torch._C._nn.linear( + hidden_states_146, + l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_146 = l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_32 = qkv_16[(Ellipsis, slice(None, 3072, None))] + key_states_32 = qkv_16[(Ellipsis, slice(3072, 6144, None))] + value_states_32 = qkv_16[(Ellipsis, slice(6144, None, None))] + qkv_16 = None + view_48 = query_states_32.view((1, 2, -1, 96)) + query_states_32 = None + query_states_33 = view_48.transpose(1, 2) + view_48 = None + view_49 = key_states_32.view((1, 2, -1, 96)) + key_states_32 = None + key_states_33 = view_49.transpose(1, 2) + view_49 = None + view_50 = value_states_32.view((1, 2, -1, 96)) + value_states_32 = None + value_states_33 = view_50.transpose(1, 2) + view_50 = None + cos_16 = l_stack0_0_.unsqueeze(1) + sin_16 = l_stack0_1_.unsqueeze(1) + q_rot_16 = query_states_33[(Ellipsis, slice(None, 96, None))] + q_pass_16 = query_states_33[(Ellipsis, slice(96, None, None))] + query_states_33 = None + k_rot_16 = key_states_33[(Ellipsis, slice(None, 96, None))] + k_pass_16 = key_states_33[(Ellipsis, slice(96, None, None))] + key_states_33 = None + mul_146 = q_rot_16 * cos_16 + x1_32 = q_rot_16[(Ellipsis, slice(None, 48, None))] + x2_32 = q_rot_16[(Ellipsis, slice(48, None, None))] + q_rot_16 = None + neg_32 = -x2_32 + x2_32 = None + cat_64 = torch.cat((neg_32, x1_32), dim=-1) + neg_32 = x1_32 = None + mul_147 = cat_64 * sin_16 + cat_64 = None + add_97 = mul_146 + mul_147 + mul_146 = mul_147 = None + q_embed_16 = torch.cat([add_97, q_pass_16], dim=-1) + add_97 = q_pass_16 = None + mul_148 = k_rot_16 * cos_16 + cos_16 = None + x1_33 = k_rot_16[(Ellipsis, slice(None, 48, None))] + x2_33 = k_rot_16[(Ellipsis, slice(48, None, None))] + k_rot_16 = None + neg_33 = -x2_33 + x2_33 = None + cat_66 = torch.cat((neg_33, x1_33), dim=-1) + neg_33 = x1_33 = None + mul_149 = cat_66 * sin_16 + cat_66 = sin_16 = None + add_98 = mul_148 + mul_149 + mul_148 = mul_149 = None + k_embed_16 = torch.cat([add_98, k_pass_16], dim=-1) + add_98 = k_pass_16 = None + attention_mask_16 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_16 = q_embed_16.contiguous() + q_embed_16 = None + key_16 = k_embed_16.contiguous() + value_16 = value_states_33.contiguous() + attn_output_64 = torch._C._nn.scaled_dot_product_attention( + query_16, + key_16, + value_16, + attn_mask=attention_mask_16, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_16 = key_16 = value_16 = attention_mask_16 = None + transpose_67 = attn_output_64.transpose(1, 2) + attn_output_64 = None + attn_output_65 = transpose_67.contiguous() + transpose_67 = None + reshape_16 = attn_output_65.reshape(1, 2, -1) + attn_output_65 = None + attn_output_66 = reshape_16.contiguous() + reshape_16 = None + attn_output_67 = torch._C._nn.linear( + attn_output_66, + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_66 = l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_32 = torch.nn.functional.dropout(attn_output_67, 0.0, False, False) + attn_output_67 = None + hidden_states_147 = hidden_states_143 + dropout_32 + hidden_states_143 = dropout_32 = None + 
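
The qkv_N slices show one fused projection producing queries, keys and values in a single matmul: the output is split at 3072 and 6144, so q, k and v each get 3072 channels, i.e. 32 heads x 96 dims with no grouped-query sharing (the head count is inferred from 3072 / 96, not stated in this excerpt). A sketch of the split and head reshape (illustrative names):

import torch

hidden, num_heads, head_dim = 3072, 32, 96
qkv = torch.randn(1, 2, 3 * hidden)        # output of the fused qkv_proj linear

q = qkv[..., :hidden]                      # slice(None, 3072, None)
k = qkv[..., hidden : 2 * hidden]          # slice(3072, 6144, None)
v = qkv[..., 2 * hidden :]                 # slice(6144, None, None)

# (batch, seq, hidden) -> (batch, heads, seq, head_dim): the view((1, 2, -1, 96))
# plus transpose(1, 2) pairs above.
q = q.view(1, 2, num_heads, head_dim).transpose(1, 2)
k = k.view(1, 2, num_heads, head_dim).transpose(1, 2)
v = v.view(1, 2, num_heads, head_dim).transpose(1, 2)
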
hidden_states_148 = hidden_states_147.to(torch.float32) + pow_34 = hidden_states_148.pow(2) + variance_33 = pow_34.mean(-1, keepdim=True) + pow_34 = None + add_100 = variance_33 + 1e-05 + variance_33 = None + rsqrt_33 = torch.rsqrt(add_100) + add_100 = None + hidden_states_149 = hidden_states_148 * rsqrt_33 + hidden_states_148 = rsqrt_33 = None + to_67 = hidden_states_149.to(torch.bfloat16) + hidden_states_149 = None + hidden_states_150 = ( + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + * to_67 + ) + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = ( + to_67 + ) = None + up_states_48 = torch._C._nn.linear( + hidden_states_150, + l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_150 = l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_16 = up_states_48.chunk(2, dim=-1) + up_states_48 = None + gate_16 = chunk_16[0] + up_states_49 = chunk_16[1] + chunk_16 = None + silu_16 = torch.nn.functional.silu(gate_16, inplace=False) + gate_16 = None + up_states_50 = up_states_49 * silu_16 + up_states_49 = silu_16 = None + hidden_states_151 = torch._C._nn.linear( + up_states_50, + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_50 = l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_33 = torch.nn.functional.dropout(hidden_states_151, 0.0, False, False) + hidden_states_151 = None + hidden_states_152 = hidden_states_147 + dropout_33 + hidden_states_147 = dropout_33 = None + hidden_states_153 = hidden_states_152.to(torch.float32) + pow_35 = hidden_states_153.pow(2) + variance_34 = pow_35.mean(-1, keepdim=True) + pow_35 = None + add_102 = variance_34 + 1e-05 + variance_34 = None + rsqrt_34 = torch.rsqrt(add_102) + add_102 = None + hidden_states_154 = hidden_states_153 * rsqrt_34 + hidden_states_153 = rsqrt_34 = None + to_69 = hidden_states_154.to(torch.bfloat16) + hidden_states_154 = None + hidden_states_155 = ( + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + * to_69 + ) + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + to_69 + ) = None + qkv_17 = torch._C._nn.linear( + hidden_states_155, + l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_155 = l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_34 = qkv_17[(Ellipsis, slice(None, 3072, None))] + key_states_34 = qkv_17[(Ellipsis, slice(3072, 6144, None))] + value_states_34 = qkv_17[(Ellipsis, slice(6144, None, None))] + qkv_17 = None + view_51 = query_states_34.view((1, 2, -1, 96)) + query_states_34 = None + query_states_35 = view_51.transpose(1, 2) + view_51 = None + view_52 = key_states_34.view((1, 2, -1, 96)) + key_states_34 = None + key_states_35 = view_52.transpose(1, 2) + view_52 = None + view_53 = value_states_34.view((1, 2, -1, 96)) + value_states_34 = None + value_states_35 = view_53.transpose(1, 2) + view_53 = None + cos_17 = l_stack0_0_.unsqueeze(1) + sin_17 = l_stack0_1_.unsqueeze(1) + q_rot_17 = query_states_35[(Ellipsis, slice(None, 96, None))] + q_pass_17 = query_states_35[(Ellipsis, slice(96, None, None))] + query_states_35 = None + k_rot_17 = key_states_35[(Ellipsis, slice(None, 96, None))] + k_pass_17 = key_states_35[(Ellipsis, slice(96, None, None))] + 
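
The chunk/silu/multiply sequence in every mlp block is a SwiGLU feed-forward: a fused gate_up_proj, a two-way chunk on the last dim, SiLU on the gate half, an elementwise product, then down_proj back to the hidden size. Sketch below (illustrative; the intermediate width is not visible in this excerpt, so 8192 is a placeholder):

import torch
import torch.nn.functional as F

hidden, intermediate = 3072, 8192          # intermediate width assumed
x = torch.randn(1, 2, hidden)
gate_up_w = torch.randn(2 * intermediate, hidden)
down_w = torch.randn(hidden, intermediate)

up_states = F.linear(x, gate_up_w)         # fused gate_up_proj
gate, up = up_states.chunk(2, dim=-1)      # chunk_N[0], chunk_N[1] above
h = up * F.silu(gate)                      # SwiGLU activation
out = F.linear(h, down_w)                  # down_proj
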
key_states_35 = None + mul_155 = q_rot_17 * cos_17 + x1_34 = q_rot_17[(Ellipsis, slice(None, 48, None))] + x2_34 = q_rot_17[(Ellipsis, slice(48, None, None))] + q_rot_17 = None + neg_34 = -x2_34 + x2_34 = None + cat_68 = torch.cat((neg_34, x1_34), dim=-1) + neg_34 = x1_34 = None + mul_156 = cat_68 * sin_17 + cat_68 = None + add_103 = mul_155 + mul_156 + mul_155 = mul_156 = None + q_embed_17 = torch.cat([add_103, q_pass_17], dim=-1) + add_103 = q_pass_17 = None + mul_157 = k_rot_17 * cos_17 + cos_17 = None + x1_35 = k_rot_17[(Ellipsis, slice(None, 48, None))] + x2_35 = k_rot_17[(Ellipsis, slice(48, None, None))] + k_rot_17 = None + neg_35 = -x2_35 + x2_35 = None + cat_70 = torch.cat((neg_35, x1_35), dim=-1) + neg_35 = x1_35 = None + mul_158 = cat_70 * sin_17 + cat_70 = sin_17 = None + add_104 = mul_157 + mul_158 + mul_157 = mul_158 = None + k_embed_17 = torch.cat([add_104, k_pass_17], dim=-1) + add_104 = k_pass_17 = None + attention_mask_17 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_17 = q_embed_17.contiguous() + q_embed_17 = None + key_17 = k_embed_17.contiguous() + value_17 = value_states_35.contiguous() + attn_output_68 = torch._C._nn.scaled_dot_product_attention( + query_17, + key_17, + value_17, + attn_mask=attention_mask_17, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_17 = key_17 = value_17 = attention_mask_17 = None + transpose_71 = attn_output_68.transpose(1, 2) + attn_output_68 = None + attn_output_69 = transpose_71.contiguous() + transpose_71 = None + reshape_17 = attn_output_69.reshape(1, 2, -1) + attn_output_69 = None + attn_output_70 = reshape_17.contiguous() + reshape_17 = None + attn_output_71 = torch._C._nn.linear( + attn_output_70, + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_70 = l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_34 = torch.nn.functional.dropout(attn_output_71, 0.0, False, False) + attn_output_71 = None + hidden_states_156 = hidden_states_152 + dropout_34 + hidden_states_152 = dropout_34 = None + hidden_states_157 = hidden_states_156.to(torch.float32) + pow_36 = hidden_states_157.pow(2) + variance_35 = pow_36.mean(-1, keepdim=True) + pow_36 = None + add_106 = variance_35 + 1e-05 + variance_35 = None + rsqrt_35 = torch.rsqrt(add_106) + add_106 = None + hidden_states_158 = hidden_states_157 * rsqrt_35 + hidden_states_157 = rsqrt_35 = None + to_71 = hidden_states_158.to(torch.bfloat16) + hidden_states_158 = None + hidden_states_159 = ( + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + * to_71 + ) + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = ( + to_71 + ) = None + up_states_51 = torch._C._nn.linear( + hidden_states_159, + l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_159 = l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_17 = up_states_51.chunk(2, dim=-1) + up_states_51 = None + gate_17 = chunk_17[0] + up_states_52 = chunk_17[1] + chunk_17 = None + silu_17 = torch.nn.functional.silu(gate_17, inplace=False) + gate_17 = None + up_states_53 = up_states_52 * silu_17 + up_states_52 = silu_17 = None + hidden_states_160 = torch._C._nn.linear( + up_states_53, + 
l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_53 = l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_35 = torch.nn.functional.dropout(hidden_states_160, 0.0, False, False) + hidden_states_160 = None + hidden_states_161 = hidden_states_156 + dropout_35 + hidden_states_156 = dropout_35 = None + hidden_states_162 = hidden_states_161.to(torch.float32) + pow_37 = hidden_states_162.pow(2) + variance_36 = pow_37.mean(-1, keepdim=True) + pow_37 = None + add_108 = variance_36 + 1e-05 + variance_36 = None + rsqrt_36 = torch.rsqrt(add_108) + add_108 = None + hidden_states_163 = hidden_states_162 * rsqrt_36 + hidden_states_162 = rsqrt_36 = None + to_73 = hidden_states_163.to(torch.bfloat16) + hidden_states_163 = None + hidden_states_164 = ( + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + * to_73 + ) + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + to_73 + ) = None + qkv_18 = torch._C._nn.linear( + hidden_states_164, + l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_164 = l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_36 = qkv_18[(Ellipsis, slice(None, 3072, None))] + key_states_36 = qkv_18[(Ellipsis, slice(3072, 6144, None))] + value_states_36 = qkv_18[(Ellipsis, slice(6144, None, None))] + qkv_18 = None + view_54 = query_states_36.view((1, 2, -1, 96)) + query_states_36 = None + query_states_37 = view_54.transpose(1, 2) + view_54 = None + view_55 = key_states_36.view((1, 2, -1, 96)) + key_states_36 = None + key_states_37 = view_55.transpose(1, 2) + view_55 = None + view_56 = value_states_36.view((1, 2, -1, 96)) + value_states_36 = None + value_states_37 = view_56.transpose(1, 2) + view_56 = None + cos_18 = l_stack0_0_.unsqueeze(1) + sin_18 = l_stack0_1_.unsqueeze(1) + q_rot_18 = query_states_37[(Ellipsis, slice(None, 96, None))] + q_pass_18 = query_states_37[(Ellipsis, slice(96, None, None))] + query_states_37 = None + k_rot_18 = key_states_37[(Ellipsis, slice(None, 96, None))] + k_pass_18 = key_states_37[(Ellipsis, slice(96, None, None))] + key_states_37 = None + mul_164 = q_rot_18 * cos_18 + x1_36 = q_rot_18[(Ellipsis, slice(None, 48, None))] + x2_36 = q_rot_18[(Ellipsis, slice(48, None, None))] + q_rot_18 = None + neg_36 = -x2_36 + x2_36 = None + cat_72 = torch.cat((neg_36, x1_36), dim=-1) + neg_36 = x1_36 = None + mul_165 = cat_72 * sin_18 + cat_72 = None + add_109 = mul_164 + mul_165 + mul_164 = mul_165 = None + q_embed_18 = torch.cat([add_109, q_pass_18], dim=-1) + add_109 = q_pass_18 = None + mul_166 = k_rot_18 * cos_18 + cos_18 = None + x1_37 = k_rot_18[(Ellipsis, slice(None, 48, None))] + x2_37 = k_rot_18[(Ellipsis, slice(48, None, None))] + k_rot_18 = None + neg_37 = -x2_37 + x2_37 = None + cat_74 = torch.cat((neg_37, x1_37), dim=-1) + neg_37 = x1_37 = None + mul_167 = cat_74 * sin_18 + cat_74 = sin_18 = None + add_110 = mul_166 + mul_167 + mul_166 = mul_167 = None + k_embed_18 = torch.cat([add_110, k_pass_18], dim=-1) + add_110 = k_pass_18 = None + attention_mask_18 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_18 = q_embed_18.contiguous() + q_embed_18 = None + key_18 = k_embed_18.contiguous() + value_18 = value_states_37.contiguous() + attn_output_72 = 
torch._C._nn.scaled_dot_product_attention( + query_18, + key_18, + value_18, + attn_mask=attention_mask_18, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_18 = key_18 = value_18 = attention_mask_18 = None + transpose_75 = attn_output_72.transpose(1, 2) + attn_output_72 = None + attn_output_73 = transpose_75.contiguous() + transpose_75 = None + reshape_18 = attn_output_73.reshape(1, 2, -1) + attn_output_73 = None + attn_output_74 = reshape_18.contiguous() + reshape_18 = None + attn_output_75 = torch._C._nn.linear( + attn_output_74, + l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_74 = l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_36 = torch.nn.functional.dropout(attn_output_75, 0.0, False, False) + attn_output_75 = None + hidden_states_165 = hidden_states_161 + dropout_36 + hidden_states_161 = dropout_36 = None + hidden_states_166 = hidden_states_165.to(torch.float32) + pow_38 = hidden_states_166.pow(2) + variance_37 = pow_38.mean(-1, keepdim=True) + pow_38 = None + add_112 = variance_37 + 1e-05 + variance_37 = None + rsqrt_37 = torch.rsqrt(add_112) + add_112 = None + hidden_states_167 = hidden_states_166 * rsqrt_37 + hidden_states_166 = rsqrt_37 = None + to_75 = hidden_states_167.to(torch.bfloat16) + hidden_states_167 = None + hidden_states_168 = ( + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + * to_75 + ) + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = ( + to_75 + ) = None + up_states_54 = torch._C._nn.linear( + hidden_states_168, + l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_168 = l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_18 = up_states_54.chunk(2, dim=-1) + up_states_54 = None + gate_18 = chunk_18[0] + up_states_55 = chunk_18[1] + chunk_18 = None + silu_18 = torch.nn.functional.silu(gate_18, inplace=False) + gate_18 = None + up_states_56 = up_states_55 * silu_18 + up_states_55 = silu_18 = None + hidden_states_169 = torch._C._nn.linear( + up_states_56, + l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_56 = l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_37 = torch.nn.functional.dropout(hidden_states_169, 0.0, False, False) + hidden_states_169 = None + hidden_states_170 = hidden_states_165 + dropout_37 + hidden_states_165 = dropout_37 = None + hidden_states_171 = hidden_states_170.to(torch.float32) + pow_39 = hidden_states_171.pow(2) + variance_38 = pow_39.mean(-1, keepdim=True) + pow_39 = None + add_114 = variance_38 + 1e-05 + variance_38 = None + rsqrt_38 = torch.rsqrt(add_114) + add_114 = None + hidden_states_172 = hidden_states_171 * rsqrt_38 + hidden_states_171 = rsqrt_38 = None + to_77 = hidden_states_172.to(torch.bfloat16) + hidden_states_172 = None + hidden_states_173 = ( + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + * to_77 + ) + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + to_77 + ) = None + qkv_19 = torch._C._nn.linear( + hidden_states_173, + l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_173 = 
l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_38 = qkv_19[(Ellipsis, slice(None, 3072, None))] + key_states_38 = qkv_19[(Ellipsis, slice(3072, 6144, None))] + value_states_38 = qkv_19[(Ellipsis, slice(6144, None, None))] + qkv_19 = None + view_57 = query_states_38.view((1, 2, -1, 96)) + query_states_38 = None + query_states_39 = view_57.transpose(1, 2) + view_57 = None + view_58 = key_states_38.view((1, 2, -1, 96)) + key_states_38 = None + key_states_39 = view_58.transpose(1, 2) + view_58 = None + view_59 = value_states_38.view((1, 2, -1, 96)) + value_states_38 = None + value_states_39 = view_59.transpose(1, 2) + view_59 = None + cos_19 = l_stack0_0_.unsqueeze(1) + sin_19 = l_stack0_1_.unsqueeze(1) + q_rot_19 = query_states_39[(Ellipsis, slice(None, 96, None))] + q_pass_19 = query_states_39[(Ellipsis, slice(96, None, None))] + query_states_39 = None + k_rot_19 = key_states_39[(Ellipsis, slice(None, 96, None))] + k_pass_19 = key_states_39[(Ellipsis, slice(96, None, None))] + key_states_39 = None + mul_173 = q_rot_19 * cos_19 + x1_38 = q_rot_19[(Ellipsis, slice(None, 48, None))] + x2_38 = q_rot_19[(Ellipsis, slice(48, None, None))] + q_rot_19 = None + neg_38 = -x2_38 + x2_38 = None + cat_76 = torch.cat((neg_38, x1_38), dim=-1) + neg_38 = x1_38 = None + mul_174 = cat_76 * sin_19 + cat_76 = None + add_115 = mul_173 + mul_174 + mul_173 = mul_174 = None + q_embed_19 = torch.cat([add_115, q_pass_19], dim=-1) + add_115 = q_pass_19 = None + mul_175 = k_rot_19 * cos_19 + cos_19 = None + x1_39 = k_rot_19[(Ellipsis, slice(None, 48, None))] + x2_39 = k_rot_19[(Ellipsis, slice(48, None, None))] + k_rot_19 = None + neg_39 = -x2_39 + x2_39 = None + cat_78 = torch.cat((neg_39, x1_39), dim=-1) + neg_39 = x1_39 = None + mul_176 = cat_78 * sin_19 + cat_78 = sin_19 = None + add_116 = mul_175 + mul_176 + mul_175 = mul_176 = None + k_embed_19 = torch.cat([add_116, k_pass_19], dim=-1) + add_116 = k_pass_19 = None + attention_mask_19 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_19 = q_embed_19.contiguous() + q_embed_19 = None + key_19 = k_embed_19.contiguous() + value_19 = value_states_39.contiguous() + attn_output_76 = torch._C._nn.scaled_dot_product_attention( + query_19, + key_19, + value_19, + attn_mask=attention_mask_19, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_19 = key_19 = value_19 = attention_mask_19 = None + transpose_79 = attn_output_76.transpose(1, 2) + attn_output_76 = None + attn_output_77 = transpose_79.contiguous() + transpose_79 = None + reshape_19 = attn_output_77.reshape(1, 2, -1) + attn_output_77 = None + attn_output_78 = reshape_19.contiguous() + reshape_19 = None + attn_output_79 = torch._C._nn.linear( + attn_output_78, + l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_78 = l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_38 = torch.nn.functional.dropout(attn_output_79, 0.0, False, False) + attn_output_79 = None + hidden_states_174 = hidden_states_170 + dropout_38 + hidden_states_170 = dropout_38 = None + hidden_states_175 = hidden_states_174.to(torch.float32) + pow_40 = hidden_states_175.pow(2) + variance_39 = pow_40.mean(-1, keepdim=True) + pow_40 = None + add_118 = variance_39 + 1e-05 + variance_39 = None + rsqrt_39 = torch.rsqrt(add_118) + add_118 = None + hidden_states_176 = 
hidden_states_175 * rsqrt_39 + hidden_states_175 = rsqrt_39 = None + to_79 = hidden_states_176.to(torch.bfloat16) + hidden_states_176 = None + hidden_states_177 = ( + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + * to_79 + ) + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = ( + to_79 + ) = None + up_states_57 = torch._C._nn.linear( + hidden_states_177, + l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_177 = l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_19 = up_states_57.chunk(2, dim=-1) + up_states_57 = None + gate_19 = chunk_19[0] + up_states_58 = chunk_19[1] + chunk_19 = None + silu_19 = torch.nn.functional.silu(gate_19, inplace=False) + gate_19 = None + up_states_59 = up_states_58 * silu_19 + up_states_58 = silu_19 = None + hidden_states_178 = torch._C._nn.linear( + up_states_59, + l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_59 = l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_39 = torch.nn.functional.dropout(hidden_states_178, 0.0, False, False) + hidden_states_178 = None + hidden_states_179 = hidden_states_174 + dropout_39 + hidden_states_174 = dropout_39 = None + hidden_states_180 = hidden_states_179.to(torch.float32) + pow_41 = hidden_states_180.pow(2) + variance_40 = pow_41.mean(-1, keepdim=True) + pow_41 = None + add_120 = variance_40 + 1e-05 + variance_40 = None + rsqrt_40 = torch.rsqrt(add_120) + add_120 = None + hidden_states_181 = hidden_states_180 * rsqrt_40 + hidden_states_180 = rsqrt_40 = None + to_81 = hidden_states_181.to(torch.bfloat16) + hidden_states_181 = None + hidden_states_182 = ( + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + * to_81 + ) + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + to_81 + ) = None + qkv_20 = torch._C._nn.linear( + hidden_states_182, + l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_182 = l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_40 = qkv_20[(Ellipsis, slice(None, 3072, None))] + key_states_40 = qkv_20[(Ellipsis, slice(3072, 6144, None))] + value_states_40 = qkv_20[(Ellipsis, slice(6144, None, None))] + qkv_20 = None + view_60 = query_states_40.view((1, 2, -1, 96)) + query_states_40 = None + query_states_41 = view_60.transpose(1, 2) + view_60 = None + view_61 = key_states_40.view((1, 2, -1, 96)) + key_states_40 = None + key_states_41 = view_61.transpose(1, 2) + view_61 = None + view_62 = value_states_40.view((1, 2, -1, 96)) + value_states_40 = None + value_states_41 = view_62.transpose(1, 2) + view_62 = None + cos_20 = l_stack0_0_.unsqueeze(1) + sin_20 = l_stack0_1_.unsqueeze(1) + q_rot_20 = query_states_41[(Ellipsis, slice(None, 96, None))] + q_pass_20 = query_states_41[(Ellipsis, slice(96, None, None))] + query_states_41 = None + k_rot_20 = key_states_41[(Ellipsis, slice(None, 96, None))] + k_pass_20 = key_states_41[(Ellipsis, slice(96, None, None))] + key_states_41 = None + mul_182 = q_rot_20 * cos_20 + x1_40 = q_rot_20[(Ellipsis, slice(None, 48, None))] + x2_40 = q_rot_20[(Ellipsis, slice(48, None, None))] + q_rot_20 = None + neg_40 = -x2_40 + x2_40 = None + cat_80 = torch.cat((neg_40, x1_40), dim=-1) + neg_40 = x1_40 = None + 
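
Each pow(2)/mean/rsqrt run above is an unrolled RMSNorm: upcast to float32, scale by rsqrt(mean(x^2) + eps) with eps = 1e-05, downcast to bfloat16, then multiply by the learned weight. The same five ops recur before and after every attention and mlp block. Reference sketch (illustrative name):

import torch

def rms_norm(x, weight, eps=1e-05):
    # Normalize in float32 for stability, exactly as the traced .to() pair does.
    h = x.to(torch.float32)
    h = h * torch.rsqrt(h.pow(2).mean(-1, keepdim=True) + eps)
    return weight * h.to(torch.bfloat16)

y = rms_norm(torch.randn(1, 2, 3072, dtype=torch.bfloat16),
             torch.ones(3072, dtype=torch.bfloat16))
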
mul_183 = cat_80 * sin_20 + cat_80 = None + add_121 = mul_182 + mul_183 + mul_182 = mul_183 = None + q_embed_20 = torch.cat([add_121, q_pass_20], dim=-1) + add_121 = q_pass_20 = None + mul_184 = k_rot_20 * cos_20 + cos_20 = None + x1_41 = k_rot_20[(Ellipsis, slice(None, 48, None))] + x2_41 = k_rot_20[(Ellipsis, slice(48, None, None))] + k_rot_20 = None + neg_41 = -x2_41 + x2_41 = None + cat_82 = torch.cat((neg_41, x1_41), dim=-1) + neg_41 = x1_41 = None + mul_185 = cat_82 * sin_20 + cat_82 = sin_20 = None + add_122 = mul_184 + mul_185 + mul_184 = mul_185 = None + k_embed_20 = torch.cat([add_122, k_pass_20], dim=-1) + add_122 = k_pass_20 = None + attention_mask_20 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_20 = q_embed_20.contiguous() + q_embed_20 = None + key_20 = k_embed_20.contiguous() + value_20 = value_states_41.contiguous() + attn_output_80 = torch._C._nn.scaled_dot_product_attention( + query_20, + key_20, + value_20, + attn_mask=attention_mask_20, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_20 = key_20 = value_20 = attention_mask_20 = None + transpose_83 = attn_output_80.transpose(1, 2) + attn_output_80 = None + attn_output_81 = transpose_83.contiguous() + transpose_83 = None + reshape_20 = attn_output_81.reshape(1, 2, -1) + attn_output_81 = None + attn_output_82 = reshape_20.contiguous() + reshape_20 = None + attn_output_83 = torch._C._nn.linear( + attn_output_82, + l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_82 = l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_40 = torch.nn.functional.dropout(attn_output_83, 0.0, False, False) + attn_output_83 = None + hidden_states_183 = hidden_states_179 + dropout_40 + hidden_states_179 = dropout_40 = None + hidden_states_184 = hidden_states_183.to(torch.float32) + pow_42 = hidden_states_184.pow(2) + variance_41 = pow_42.mean(-1, keepdim=True) + pow_42 = None + add_124 = variance_41 + 1e-05 + variance_41 = None + rsqrt_41 = torch.rsqrt(add_124) + add_124 = None + hidden_states_185 = hidden_states_184 * rsqrt_41 + hidden_states_184 = rsqrt_41 = None + to_83 = hidden_states_185.to(torch.bfloat16) + hidden_states_185 = None + hidden_states_186 = ( + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + * to_83 + ) + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = ( + to_83 + ) = None + up_states_60 = torch._C._nn.linear( + hidden_states_186, + l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_186 = l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_20 = up_states_60.chunk(2, dim=-1) + up_states_60 = None + gate_20 = chunk_20[0] + up_states_61 = chunk_20[1] + chunk_20 = None + silu_20 = torch.nn.functional.silu(gate_20, inplace=False) + gate_20 = None + up_states_62 = up_states_61 * silu_20 + up_states_61 = silu_20 = None + hidden_states_187 = torch._C._nn.linear( + up_states_62, + l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_62 = l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_41 = torch.nn.functional.dropout(hidden_states_187, 0.0, False, False) + hidden_states_187 = None + hidden_states_188 = 
hidden_states_183 + dropout_41 + hidden_states_183 = dropout_41 = None + hidden_states_189 = hidden_states_188.to(torch.float32) + pow_43 = hidden_states_189.pow(2) + variance_42 = pow_43.mean(-1, keepdim=True) + pow_43 = None + add_126 = variance_42 + 1e-05 + variance_42 = None + rsqrt_42 = torch.rsqrt(add_126) + add_126 = None + hidden_states_190 = hidden_states_189 * rsqrt_42 + hidden_states_189 = rsqrt_42 = None + to_85 = hidden_states_190.to(torch.bfloat16) + hidden_states_190 = None + hidden_states_191 = ( + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + * to_85 + ) + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + to_85 + ) = None + qkv_21 = torch._C._nn.linear( + hidden_states_191, + l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_191 = l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_42 = qkv_21[(Ellipsis, slice(None, 3072, None))] + key_states_42 = qkv_21[(Ellipsis, slice(3072, 6144, None))] + value_states_42 = qkv_21[(Ellipsis, slice(6144, None, None))] + qkv_21 = None + view_63 = query_states_42.view((1, 2, -1, 96)) + query_states_42 = None + query_states_43 = view_63.transpose(1, 2) + view_63 = None + view_64 = key_states_42.view((1, 2, -1, 96)) + key_states_42 = None + key_states_43 = view_64.transpose(1, 2) + view_64 = None + view_65 = value_states_42.view((1, 2, -1, 96)) + value_states_42 = None + value_states_43 = view_65.transpose(1, 2) + view_65 = None + cos_21 = l_stack0_0_.unsqueeze(1) + sin_21 = l_stack0_1_.unsqueeze(1) + q_rot_21 = query_states_43[(Ellipsis, slice(None, 96, None))] + q_pass_21 = query_states_43[(Ellipsis, slice(96, None, None))] + query_states_43 = None + k_rot_21 = key_states_43[(Ellipsis, slice(None, 96, None))] + k_pass_21 = key_states_43[(Ellipsis, slice(96, None, None))] + key_states_43 = None + mul_191 = q_rot_21 * cos_21 + x1_42 = q_rot_21[(Ellipsis, slice(None, 48, None))] + x2_42 = q_rot_21[(Ellipsis, slice(48, None, None))] + q_rot_21 = None + neg_42 = -x2_42 + x2_42 = None + cat_84 = torch.cat((neg_42, x1_42), dim=-1) + neg_42 = x1_42 = None + mul_192 = cat_84 * sin_21 + cat_84 = None + add_127 = mul_191 + mul_192 + mul_191 = mul_192 = None + q_embed_21 = torch.cat([add_127, q_pass_21], dim=-1) + add_127 = q_pass_21 = None + mul_193 = k_rot_21 * cos_21 + cos_21 = None + x1_43 = k_rot_21[(Ellipsis, slice(None, 48, None))] + x2_43 = k_rot_21[(Ellipsis, slice(48, None, None))] + k_rot_21 = None + neg_43 = -x2_43 + x2_43 = None + cat_86 = torch.cat((neg_43, x1_43), dim=-1) + neg_43 = x1_43 = None + mul_194 = cat_86 * sin_21 + cat_86 = sin_21 = None + add_128 = mul_193 + mul_194 + mul_193 = mul_194 = None + k_embed_21 = torch.cat([add_128, k_pass_21], dim=-1) + add_128 = k_pass_21 = None + attention_mask_21 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_21 = q_embed_21.contiguous() + q_embed_21 = None + key_21 = k_embed_21.contiguous() + value_21 = value_states_43.contiguous() + attn_output_84 = torch._C._nn.scaled_dot_product_attention( + query_21, + key_21, + value_21, + attn_mask=attention_mask_21, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_21 = key_21 = value_21 = attention_mask_21 = None + transpose_87 = attn_output_84.transpose(1, 2) + attn_output_84 = None + attn_output_85 = transpose_87.contiguous() + transpose_87 = 
None + reshape_21 = attn_output_85.reshape(1, 2, -1) + attn_output_85 = None + attn_output_86 = reshape_21.contiguous() + reshape_21 = None + attn_output_87 = torch._C._nn.linear( + attn_output_86, + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_86 = l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_42 = torch.nn.functional.dropout(attn_output_87, 0.0, False, False) + attn_output_87 = None + hidden_states_192 = hidden_states_188 + dropout_42 + hidden_states_188 = dropout_42 = None + hidden_states_193 = hidden_states_192.to(torch.float32) + pow_44 = hidden_states_193.pow(2) + variance_43 = pow_44.mean(-1, keepdim=True) + pow_44 = None + add_130 = variance_43 + 1e-05 + variance_43 = None + rsqrt_43 = torch.rsqrt(add_130) + add_130 = None + hidden_states_194 = hidden_states_193 * rsqrt_43 + hidden_states_193 = rsqrt_43 = None + to_87 = hidden_states_194.to(torch.bfloat16) + hidden_states_194 = None + hidden_states_195 = ( + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + * to_87 + ) + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = ( + to_87 + ) = None + up_states_63 = torch._C._nn.linear( + hidden_states_195, + l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_195 = l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_21 = up_states_63.chunk(2, dim=-1) + up_states_63 = None + gate_21 = chunk_21[0] + up_states_64 = chunk_21[1] + chunk_21 = None + silu_21 = torch.nn.functional.silu(gate_21, inplace=False) + gate_21 = None + up_states_65 = up_states_64 * silu_21 + up_states_64 = silu_21 = None + hidden_states_196 = torch._C._nn.linear( + up_states_65, + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_65 = l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_43 = torch.nn.functional.dropout(hidden_states_196, 0.0, False, False) + hidden_states_196 = None + hidden_states_197 = hidden_states_192 + dropout_43 + hidden_states_192 = dropout_43 = None + hidden_states_198 = hidden_states_197.to(torch.float32) + pow_45 = hidden_states_198.pow(2) + variance_44 = pow_45.mean(-1, keepdim=True) + pow_45 = None + add_132 = variance_44 + 1e-05 + variance_44 = None + rsqrt_44 = torch.rsqrt(add_132) + add_132 = None + hidden_states_199 = hidden_states_198 * rsqrt_44 + hidden_states_198 = rsqrt_44 = None + to_89 = hidden_states_199.to(torch.bfloat16) + hidden_states_199 = None + hidden_states_200 = ( + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + * to_89 + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + to_89 + ) = None + qkv_22 = torch._C._nn.linear( + hidden_states_200, + l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_200 = l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_44 = qkv_22[(Ellipsis, slice(None, 3072, None))] + key_states_44 = qkv_22[(Ellipsis, slice(3072, 6144, None))] + value_states_44 = qkv_22[(Ellipsis, slice(6144, None, None))] + qkv_22 = None + view_66 = query_states_44.view((1, 2, -1, 96)) + query_states_44 = None + query_states_45 = view_66.transpose(1, 2) + view_66 = None + view_67 
= key_states_44.view((1, 2, -1, 96)) + key_states_44 = None + key_states_45 = view_67.transpose(1, 2) + view_67 = None + view_68 = value_states_44.view((1, 2, -1, 96)) + value_states_44 = None + value_states_45 = view_68.transpose(1, 2) + view_68 = None + cos_22 = l_stack0_0_.unsqueeze(1) + sin_22 = l_stack0_1_.unsqueeze(1) + q_rot_22 = query_states_45[(Ellipsis, slice(None, 96, None))] + q_pass_22 = query_states_45[(Ellipsis, slice(96, None, None))] + query_states_45 = None + k_rot_22 = key_states_45[(Ellipsis, slice(None, 96, None))] + k_pass_22 = key_states_45[(Ellipsis, slice(96, None, None))] + key_states_45 = None + mul_200 = q_rot_22 * cos_22 + x1_44 = q_rot_22[(Ellipsis, slice(None, 48, None))] + x2_44 = q_rot_22[(Ellipsis, slice(48, None, None))] + q_rot_22 = None + neg_44 = -x2_44 + x2_44 = None + cat_88 = torch.cat((neg_44, x1_44), dim=-1) + neg_44 = x1_44 = None + mul_201 = cat_88 * sin_22 + cat_88 = None + add_133 = mul_200 + mul_201 + mul_200 = mul_201 = None + q_embed_22 = torch.cat([add_133, q_pass_22], dim=-1) + add_133 = q_pass_22 = None + mul_202 = k_rot_22 * cos_22 + cos_22 = None + x1_45 = k_rot_22[(Ellipsis, slice(None, 48, None))] + x2_45 = k_rot_22[(Ellipsis, slice(48, None, None))] + k_rot_22 = None + neg_45 = -x2_45 + x2_45 = None + cat_90 = torch.cat((neg_45, x1_45), dim=-1) + neg_45 = x1_45 = None + mul_203 = cat_90 * sin_22 + cat_90 = sin_22 = None + add_134 = mul_202 + mul_203 + mul_202 = mul_203 = None + k_embed_22 = torch.cat([add_134, k_pass_22], dim=-1) + add_134 = k_pass_22 = None + attention_mask_22 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_22 = q_embed_22.contiguous() + q_embed_22 = None + key_22 = k_embed_22.contiguous() + value_22 = value_states_45.contiguous() + attn_output_88 = torch._C._nn.scaled_dot_product_attention( + query_22, + key_22, + value_22, + attn_mask=attention_mask_22, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_22 = key_22 = value_22 = attention_mask_22 = None + transpose_91 = attn_output_88.transpose(1, 2) + attn_output_88 = None + attn_output_89 = transpose_91.contiguous() + transpose_91 = None + reshape_22 = attn_output_89.reshape(1, 2, -1) + attn_output_89 = None + attn_output_90 = reshape_22.contiguous() + reshape_22 = None + attn_output_91 = torch._C._nn.linear( + attn_output_90, + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_90 = l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_44 = torch.nn.functional.dropout(attn_output_91, 0.0, False, False) + attn_output_91 = None + hidden_states_201 = hidden_states_197 + dropout_44 + hidden_states_197 = dropout_44 = None + hidden_states_202 = hidden_states_201.to(torch.float32) + pow_46 = hidden_states_202.pow(2) + variance_45 = pow_46.mean(-1, keepdim=True) + pow_46 = None + add_136 = variance_45 + 1e-05 + variance_45 = None + rsqrt_45 = torch.rsqrt(add_136) + add_136 = None + hidden_states_203 = hidden_states_202 * rsqrt_45 + hidden_states_202 = rsqrt_45 = None + to_91 = hidden_states_203.to(torch.bfloat16) + hidden_states_203 = None + hidden_states_204 = ( + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + * to_91 + ) + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = ( + to_91 + ) = None + up_states_66 = torch._C._nn.linear( + hidden_states_204, + 
l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_204 = l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_22 = up_states_66.chunk(2, dim=-1) + up_states_66 = None + gate_22 = chunk_22[0] + up_states_67 = chunk_22[1] + chunk_22 = None + silu_22 = torch.nn.functional.silu(gate_22, inplace=False) + gate_22 = None + up_states_68 = up_states_67 * silu_22 + up_states_67 = silu_22 = None + hidden_states_205 = torch._C._nn.linear( + up_states_68, + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_68 = l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_45 = torch.nn.functional.dropout(hidden_states_205, 0.0, False, False) + hidden_states_205 = None + hidden_states_206 = hidden_states_201 + dropout_45 + hidden_states_201 = dropout_45 = None + hidden_states_207 = hidden_states_206.to(torch.float32) + pow_47 = hidden_states_207.pow(2) + variance_46 = pow_47.mean(-1, keepdim=True) + pow_47 = None + add_138 = variance_46 + 1e-05 + variance_46 = None + rsqrt_46 = torch.rsqrt(add_138) + add_138 = None + hidden_states_208 = hidden_states_207 * rsqrt_46 + hidden_states_207 = rsqrt_46 = None + to_93 = hidden_states_208.to(torch.bfloat16) + hidden_states_208 = None + hidden_states_209 = ( + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + * to_93 + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + to_93 + ) = None + qkv_23 = torch._C._nn.linear( + hidden_states_209, + l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_209 = l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_46 = qkv_23[(Ellipsis, slice(None, 3072, None))] + key_states_46 = qkv_23[(Ellipsis, slice(3072, 6144, None))] + value_states_46 = qkv_23[(Ellipsis, slice(6144, None, None))] + qkv_23 = None + view_69 = query_states_46.view((1, 2, -1, 96)) + query_states_46 = None + query_states_47 = view_69.transpose(1, 2) + view_69 = None + view_70 = key_states_46.view((1, 2, -1, 96)) + key_states_46 = None + key_states_47 = view_70.transpose(1, 2) + view_70 = None + view_71 = value_states_46.view((1, 2, -1, 96)) + value_states_46 = None + value_states_47 = view_71.transpose(1, 2) + view_71 = None + cos_23 = l_stack0_0_.unsqueeze(1) + sin_23 = l_stack0_1_.unsqueeze(1) + q_rot_23 = query_states_47[(Ellipsis, slice(None, 96, None))] + q_pass_23 = query_states_47[(Ellipsis, slice(96, None, None))] + query_states_47 = None + k_rot_23 = key_states_47[(Ellipsis, slice(None, 96, None))] + k_pass_23 = key_states_47[(Ellipsis, slice(96, None, None))] + key_states_47 = None + mul_209 = q_rot_23 * cos_23 + x1_46 = q_rot_23[(Ellipsis, slice(None, 48, None))] + x2_46 = q_rot_23[(Ellipsis, slice(48, None, None))] + q_rot_23 = None + neg_46 = -x2_46 + x2_46 = None + cat_92 = torch.cat((neg_46, x1_46), dim=-1) + neg_46 = x1_46 = None + mul_210 = cat_92 * sin_23 + cat_92 = None + add_139 = mul_209 + mul_210 + mul_209 = mul_210 = None + q_embed_23 = torch.cat([add_139, q_pass_23], dim=-1) + add_139 = q_pass_23 = None + mul_211 = k_rot_23 * cos_23 + cos_23 = None + x1_47 = k_rot_23[(Ellipsis, slice(None, 48, None))] + x2_47 = k_rot_23[(Ellipsis, slice(48, None, None))] + k_rot_23 = None + neg_47 = -x2_47 + x2_47 = None + cat_94 = torch.cat((neg_47, x1_47), dim=-1) + 
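
Zooming out, each repeated span above is one pre-norm decoder layer with two residual adds, and the pervasive name = None assignments are the generated code dropping its last reference to each intermediate (and to each weight after its final use) so the allocator can reclaim memory as the graph runs. Compressed to a schematic (our summary, not the generated code; dropout_p is 0.0 throughout, so both dropout calls are identity here):

    h = x + dropout(o_proj(sdpa(rope(split_qkv(qkv_proj(rms_norm(x)))))))
    y = h + dropout(down_proj(swiglu(rms_norm(h))))
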
neg_47 = x1_47 = None + mul_212 = cat_94 * sin_23 + cat_94 = sin_23 = None + add_140 = mul_211 + mul_212 + mul_211 = mul_212 = None + k_embed_23 = torch.cat([add_140, k_pass_23], dim=-1) + add_140 = k_pass_23 = None + attention_mask_23 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_23 = q_embed_23.contiguous() + q_embed_23 = None + key_23 = k_embed_23.contiguous() + value_23 = value_states_47.contiguous() + attn_output_92 = torch._C._nn.scaled_dot_product_attention( + query_23, + key_23, + value_23, + attn_mask=attention_mask_23, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_23 = key_23 = value_23 = attention_mask_23 = None + transpose_95 = attn_output_92.transpose(1, 2) + attn_output_92 = None + attn_output_93 = transpose_95.contiguous() + transpose_95 = None + reshape_23 = attn_output_93.reshape(1, 2, -1) + attn_output_93 = None + attn_output_94 = reshape_23.contiguous() + reshape_23 = None + attn_output_95 = torch._C._nn.linear( + attn_output_94, + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_94 = l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_46 = torch.nn.functional.dropout(attn_output_95, 0.0, False, False) + attn_output_95 = None + hidden_states_210 = hidden_states_206 + dropout_46 + hidden_states_206 = dropout_46 = None + hidden_states_211 = hidden_states_210.to(torch.float32) + pow_48 = hidden_states_211.pow(2) + variance_47 = pow_48.mean(-1, keepdim=True) + pow_48 = None + add_142 = variance_47 + 1e-05 + variance_47 = None + rsqrt_47 = torch.rsqrt(add_142) + add_142 = None + hidden_states_212 = hidden_states_211 * rsqrt_47 + hidden_states_211 = rsqrt_47 = None + to_95 = hidden_states_212.to(torch.bfloat16) + hidden_states_212 = None + hidden_states_213 = ( + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + * to_95 + ) + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = ( + to_95 + ) = None + up_states_69 = torch._C._nn.linear( + hidden_states_213, + l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_213 = l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_23 = up_states_69.chunk(2, dim=-1) + up_states_69 = None + gate_23 = chunk_23[0] + up_states_70 = chunk_23[1] + chunk_23 = None + silu_23 = torch.nn.functional.silu(gate_23, inplace=False) + gate_23 = None + up_states_71 = up_states_70 * silu_23 + up_states_70 = silu_23 = None + hidden_states_214 = torch._C._nn.linear( + up_states_71, + l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_71 = l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_47 = torch.nn.functional.dropout(hidden_states_214, 0.0, False, False) + hidden_states_214 = None + hidden_states_215 = hidden_states_210 + dropout_47 + hidden_states_210 = dropout_47 = None + hidden_states_216 = hidden_states_215.to(torch.float32) + pow_49 = hidden_states_216.pow(2) + variance_48 = pow_49.mean(-1, keepdim=True) + pow_49 = None + add_144 = variance_48 + 1e-05 + variance_48 = None + rsqrt_48 = torch.rsqrt(add_144) + add_144 = None + hidden_states_217 = hidden_states_216 * rsqrt_48 + hidden_states_216 = rsqrt_48 = None + to_97 = 
hidden_states_217.to(torch.bfloat16) + hidden_states_217 = None + hidden_states_218 = ( + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ + * to_97 + ) + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = ( + to_97 + ) = None + qkv_24 = torch._C._nn.linear( + hidden_states_218, + l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_218 = l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_48 = qkv_24[(Ellipsis, slice(None, 3072, None))] + key_states_48 = qkv_24[(Ellipsis, slice(3072, 6144, None))] + value_states_48 = qkv_24[(Ellipsis, slice(6144, None, None))] + qkv_24 = None + view_72 = query_states_48.view((1, 2, -1, 96)) + query_states_48 = None + query_states_49 = view_72.transpose(1, 2) + view_72 = None + view_73 = key_states_48.view((1, 2, -1, 96)) + key_states_48 = None + key_states_49 = view_73.transpose(1, 2) + view_73 = None + view_74 = value_states_48.view((1, 2, -1, 96)) + value_states_48 = None + value_states_49 = view_74.transpose(1, 2) + view_74 = None + cos_24 = l_stack0_0_.unsqueeze(1) + sin_24 = l_stack0_1_.unsqueeze(1) + q_rot_24 = query_states_49[(Ellipsis, slice(None, 96, None))] + q_pass_24 = query_states_49[(Ellipsis, slice(96, None, None))] + query_states_49 = None + k_rot_24 = key_states_49[(Ellipsis, slice(None, 96, None))] + k_pass_24 = key_states_49[(Ellipsis, slice(96, None, None))] + key_states_49 = None + mul_218 = q_rot_24 * cos_24 + x1_48 = q_rot_24[(Ellipsis, slice(None, 48, None))] + x2_48 = q_rot_24[(Ellipsis, slice(48, None, None))] + q_rot_24 = None + neg_48 = -x2_48 + x2_48 = None + cat_96 = torch.cat((neg_48, x1_48), dim=-1) + neg_48 = x1_48 = None + mul_219 = cat_96 * sin_24 + cat_96 = None + add_145 = mul_218 + mul_219 + mul_218 = mul_219 = None + q_embed_24 = torch.cat([add_145, q_pass_24], dim=-1) + add_145 = q_pass_24 = None + mul_220 = k_rot_24 * cos_24 + cos_24 = None + x1_49 = k_rot_24[(Ellipsis, slice(None, 48, None))] + x2_49 = k_rot_24[(Ellipsis, slice(48, None, None))] + k_rot_24 = None + neg_49 = -x2_49 + x2_49 = None + cat_98 = torch.cat((neg_49, x1_49), dim=-1) + neg_49 = x1_49 = None + mul_221 = cat_98 * sin_24 + cat_98 = sin_24 = None + add_146 = mul_220 + mul_221 + mul_220 = mul_221 = None + k_embed_24 = torch.cat([add_146, k_pass_24], dim=-1) + add_146 = k_pass_24 = None + attention_mask_24 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_24 = q_embed_24.contiguous() + q_embed_24 = None + key_24 = k_embed_24.contiguous() + value_24 = value_states_49.contiguous() + attn_output_96 = torch._C._nn.scaled_dot_product_attention( + query_24, + key_24, + value_24, + attn_mask=attention_mask_24, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_24 = key_24 = value_24 = attention_mask_24 = None + transpose_99 = attn_output_96.transpose(1, 2) + attn_output_96 = None + attn_output_97 = transpose_99.contiguous() + transpose_99 = None + reshape_24 = attn_output_97.reshape(1, 2, -1) + attn_output_97 = None + attn_output_98 = reshape_24.contiguous() + reshape_24 = None + attn_output_99 = torch._C._nn.linear( + attn_output_98, + l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_98 = l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_48 = 
torch.nn.functional.dropout(attn_output_99, 0.0, False, False) + attn_output_99 = None + hidden_states_219 = hidden_states_215 + dropout_48 + hidden_states_215 = dropout_48 = None + hidden_states_220 = hidden_states_219.to(torch.float32) + pow_50 = hidden_states_220.pow(2) + variance_49 = pow_50.mean(-1, keepdim=True) + pow_50 = None + add_148 = variance_49 + 1e-05 + variance_49 = None + rsqrt_49 = torch.rsqrt(add_148) + add_148 = None + hidden_states_221 = hidden_states_220 * rsqrt_49 + hidden_states_220 = rsqrt_49 = None + to_99 = hidden_states_221.to(torch.bfloat16) + hidden_states_221 = None + hidden_states_222 = ( + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ + * to_99 + ) + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = ( + to_99 + ) = None + up_states_72 = torch._C._nn.linear( + hidden_states_222, + l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_222 = l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_24 = up_states_72.chunk(2, dim=-1) + up_states_72 = None + gate_24 = chunk_24[0] + up_states_73 = chunk_24[1] + chunk_24 = None + silu_24 = torch.nn.functional.silu(gate_24, inplace=False) + gate_24 = None + up_states_74 = up_states_73 * silu_24 + up_states_73 = silu_24 = None + hidden_states_223 = torch._C._nn.linear( + up_states_74, + l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_74 = l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_49 = torch.nn.functional.dropout(hidden_states_223, 0.0, False, False) + hidden_states_223 = None + hidden_states_224 = hidden_states_219 + dropout_49 + hidden_states_219 = dropout_49 = None + hidden_states_225 = hidden_states_224.to(torch.float32) + pow_51 = hidden_states_225.pow(2) + variance_50 = pow_51.mean(-1, keepdim=True) + pow_51 = None + add_150 = variance_50 + 1e-05 + variance_50 = None + rsqrt_50 = torch.rsqrt(add_150) + add_150 = None + hidden_states_226 = hidden_states_225 * rsqrt_50 + hidden_states_225 = rsqrt_50 = None + to_101 = hidden_states_226.to(torch.bfloat16) + hidden_states_226 = None + hidden_states_227 = ( + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ + * to_101 + ) + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = ( + to_101 + ) = None + qkv_25 = torch._C._nn.linear( + hidden_states_227, + l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_227 = l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_50 = qkv_25[(Ellipsis, slice(None, 3072, None))] + key_states_50 = qkv_25[(Ellipsis, slice(3072, 6144, None))] + value_states_50 = qkv_25[(Ellipsis, slice(6144, None, None))] + qkv_25 = None + view_75 = query_states_50.view((1, 2, -1, 96)) + query_states_50 = None + query_states_51 = view_75.transpose(1, 2) + view_75 = None + view_76 = key_states_50.view((1, 2, -1, 96)) + key_states_50 = None + key_states_51 = view_76.transpose(1, 2) + view_76 = None + view_77 = value_states_50.view((1, 2, -1, 96)) + value_states_50 = None + value_states_51 = view_77.transpose(1, 2) + view_77 = None + cos_25 = l_stack0_0_.unsqueeze(1) + sin_25 = l_stack0_1_.unsqueeze(1) + q_rot_25 = query_states_51[(Ellipsis, slice(None, 96, None))] + q_pass_25 = 
query_states_51[(Ellipsis, slice(96, None, None))] + query_states_51 = None + k_rot_25 = key_states_51[(Ellipsis, slice(None, 96, None))] + k_pass_25 = key_states_51[(Ellipsis, slice(96, None, None))] + key_states_51 = None + mul_227 = q_rot_25 * cos_25 + x1_50 = q_rot_25[(Ellipsis, slice(None, 48, None))] + x2_50 = q_rot_25[(Ellipsis, slice(48, None, None))] + q_rot_25 = None + neg_50 = -x2_50 + x2_50 = None + cat_100 = torch.cat((neg_50, x1_50), dim=-1) + neg_50 = x1_50 = None + mul_228 = cat_100 * sin_25 + cat_100 = None + add_151 = mul_227 + mul_228 + mul_227 = mul_228 = None + q_embed_25 = torch.cat([add_151, q_pass_25], dim=-1) + add_151 = q_pass_25 = None + mul_229 = k_rot_25 * cos_25 + cos_25 = None + x1_51 = k_rot_25[(Ellipsis, slice(None, 48, None))] + x2_51 = k_rot_25[(Ellipsis, slice(48, None, None))] + k_rot_25 = None + neg_51 = -x2_51 + x2_51 = None + cat_102 = torch.cat((neg_51, x1_51), dim=-1) + neg_51 = x1_51 = None + mul_230 = cat_102 * sin_25 + cat_102 = sin_25 = None + add_152 = mul_229 + mul_230 + mul_229 = mul_230 = None + k_embed_25 = torch.cat([add_152, k_pass_25], dim=-1) + add_152 = k_pass_25 = None + attention_mask_25 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_25 = q_embed_25.contiguous() + q_embed_25 = None + key_25 = k_embed_25.contiguous() + value_25 = value_states_51.contiguous() + attn_output_100 = torch._C._nn.scaled_dot_product_attention( + query_25, + key_25, + value_25, + attn_mask=attention_mask_25, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_25 = key_25 = value_25 = attention_mask_25 = None + transpose_103 = attn_output_100.transpose(1, 2) + attn_output_100 = None + attn_output_101 = transpose_103.contiguous() + transpose_103 = None + reshape_25 = attn_output_101.reshape(1, 2, -1) + attn_output_101 = None + attn_output_102 = reshape_25.contiguous() + reshape_25 = None + attn_output_103 = torch._C._nn.linear( + attn_output_102, + l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_102 = l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_50 = torch.nn.functional.dropout(attn_output_103, 0.0, False, False) + attn_output_103 = None + hidden_states_228 = hidden_states_224 + dropout_50 + hidden_states_224 = dropout_50 = None + hidden_states_229 = hidden_states_228.to(torch.float32) + pow_52 = hidden_states_229.pow(2) + variance_51 = pow_52.mean(-1, keepdim=True) + pow_52 = None + add_154 = variance_51 + 1e-05 + variance_51 = None + rsqrt_51 = torch.rsqrt(add_154) + add_154 = None + hidden_states_230 = hidden_states_229 * rsqrt_51 + hidden_states_229 = rsqrt_51 = None + to_103 = hidden_states_230.to(torch.bfloat16) + hidden_states_230 = None + hidden_states_231 = ( + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ + * to_103 + ) + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = ( + to_103 + ) = None + up_states_75 = torch._C._nn.linear( + hidden_states_231, + l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_231 = l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_25 = up_states_75.chunk(2, dim=-1) + up_states_75 = None + gate_25 = chunk_25[0] + up_states_76 = chunk_25[1] + chunk_25 = None + silu_25 = 
torch.nn.functional.silu(gate_25, inplace=False) + gate_25 = None + up_states_77 = up_states_76 * silu_25 + up_states_76 = silu_25 = None + hidden_states_232 = torch._C._nn.linear( + up_states_77, + l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_77 = l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_51 = torch.nn.functional.dropout(hidden_states_232, 0.0, False, False) + hidden_states_232 = None + hidden_states_233 = hidden_states_228 + dropout_51 + hidden_states_228 = dropout_51 = None + hidden_states_234 = hidden_states_233.to(torch.float32) + pow_53 = hidden_states_234.pow(2) + variance_52 = pow_53.mean(-1, keepdim=True) + pow_53 = None + add_156 = variance_52 + 1e-05 + variance_52 = None + rsqrt_52 = torch.rsqrt(add_156) + add_156 = None + hidden_states_235 = hidden_states_234 * rsqrt_52 + hidden_states_234 = rsqrt_52 = None + to_105 = hidden_states_235.to(torch.bfloat16) + hidden_states_235 = None + hidden_states_236 = ( + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ + * to_105 + ) + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = ( + to_105 + ) = None + qkv_26 = torch._C._nn.linear( + hidden_states_236, + l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_236 = l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_52 = qkv_26[(Ellipsis, slice(None, 3072, None))] + key_states_52 = qkv_26[(Ellipsis, slice(3072, 6144, None))] + value_states_52 = qkv_26[(Ellipsis, slice(6144, None, None))] + qkv_26 = None + view_78 = query_states_52.view((1, 2, -1, 96)) + query_states_52 = None + query_states_53 = view_78.transpose(1, 2) + view_78 = None + view_79 = key_states_52.view((1, 2, -1, 96)) + key_states_52 = None + key_states_53 = view_79.transpose(1, 2) + view_79 = None + view_80 = value_states_52.view((1, 2, -1, 96)) + value_states_52 = None + value_states_53 = view_80.transpose(1, 2) + view_80 = None + cos_26 = l_stack0_0_.unsqueeze(1) + sin_26 = l_stack0_1_.unsqueeze(1) + q_rot_26 = query_states_53[(Ellipsis, slice(None, 96, None))] + q_pass_26 = query_states_53[(Ellipsis, slice(96, None, None))] + query_states_53 = None + k_rot_26 = key_states_53[(Ellipsis, slice(None, 96, None))] + k_pass_26 = key_states_53[(Ellipsis, slice(96, None, None))] + key_states_53 = None + mul_236 = q_rot_26 * cos_26 + x1_52 = q_rot_26[(Ellipsis, slice(None, 48, None))] + x2_52 = q_rot_26[(Ellipsis, slice(48, None, None))] + q_rot_26 = None + neg_52 = -x2_52 + x2_52 = None + cat_104 = torch.cat((neg_52, x1_52), dim=-1) + neg_52 = x1_52 = None + mul_237 = cat_104 * sin_26 + cat_104 = None + add_157 = mul_236 + mul_237 + mul_236 = mul_237 = None + q_embed_26 = torch.cat([add_157, q_pass_26], dim=-1) + add_157 = q_pass_26 = None + mul_238 = k_rot_26 * cos_26 + cos_26 = None + x1_53 = k_rot_26[(Ellipsis, slice(None, 48, None))] + x2_53 = k_rot_26[(Ellipsis, slice(48, None, None))] + k_rot_26 = None + neg_53 = -x2_53 + x2_53 = None + cat_106 = torch.cat((neg_53, x1_53), dim=-1) + neg_53 = x1_53 = None + mul_239 = cat_106 * sin_26 + cat_106 = sin_26 = None + add_158 = mul_238 + mul_239 + mul_238 = mul_239 = None + k_embed_26 = torch.cat([add_158, k_pass_26], dim=-1) + add_158 = k_pass_26 = None + attention_mask_26 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 
2, None), + ) + ] + query_26 = q_embed_26.contiguous() + q_embed_26 = None + key_26 = k_embed_26.contiguous() + value_26 = value_states_53.contiguous() + attn_output_104 = torch._C._nn.scaled_dot_product_attention( + query_26, + key_26, + value_26, + attn_mask=attention_mask_26, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_26 = key_26 = value_26 = attention_mask_26 = None + transpose_107 = attn_output_104.transpose(1, 2) + attn_output_104 = None + attn_output_105 = transpose_107.contiguous() + transpose_107 = None + reshape_26 = attn_output_105.reshape(1, 2, -1) + attn_output_105 = None + attn_output_106 = reshape_26.contiguous() + reshape_26 = None + attn_output_107 = torch._C._nn.linear( + attn_output_106, + l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_106 = l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_52 = torch.nn.functional.dropout(attn_output_107, 0.0, False, False) + attn_output_107 = None + hidden_states_237 = hidden_states_233 + dropout_52 + hidden_states_233 = dropout_52 = None + hidden_states_238 = hidden_states_237.to(torch.float32) + pow_54 = hidden_states_238.pow(2) + variance_53 = pow_54.mean(-1, keepdim=True) + pow_54 = None + add_160 = variance_53 + 1e-05 + variance_53 = None + rsqrt_53 = torch.rsqrt(add_160) + add_160 = None + hidden_states_239 = hidden_states_238 * rsqrt_53 + hidden_states_238 = rsqrt_53 = None + to_107 = hidden_states_239.to(torch.bfloat16) + hidden_states_239 = None + hidden_states_240 = ( + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ + * to_107 + ) + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = ( + to_107 + ) = None + up_states_78 = torch._C._nn.linear( + hidden_states_240, + l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_240 = l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_26 = up_states_78.chunk(2, dim=-1) + up_states_78 = None + gate_26 = chunk_26[0] + up_states_79 = chunk_26[1] + chunk_26 = None + silu_26 = torch.nn.functional.silu(gate_26, inplace=False) + gate_26 = None + up_states_80 = up_states_79 * silu_26 + up_states_79 = silu_26 = None + hidden_states_241 = torch._C._nn.linear( + up_states_80, + l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_80 = l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_53 = torch.nn.functional.dropout(hidden_states_241, 0.0, False, False) + hidden_states_241 = None + hidden_states_242 = hidden_states_237 + dropout_53 + hidden_states_237 = dropout_53 = None + hidden_states_243 = hidden_states_242.to(torch.float32) + pow_55 = hidden_states_243.pow(2) + variance_54 = pow_55.mean(-1, keepdim=True) + pow_55 = None + add_162 = variance_54 + 1e-05 + variance_54 = None + rsqrt_54 = torch.rsqrt(add_162) + add_162 = None + hidden_states_244 = hidden_states_243 * rsqrt_54 + hidden_states_243 = rsqrt_54 = None + to_109 = hidden_states_244.to(torch.bfloat16) + hidden_states_244 = None + hidden_states_245 = ( + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ + * to_109 + ) + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = ( + to_109 + ) = None + qkv_27 = torch._C._nn.linear( + hidden_states_245, + 
l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_245 = l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_54 = qkv_27[(Ellipsis, slice(None, 3072, None))] + key_states_54 = qkv_27[(Ellipsis, slice(3072, 6144, None))] + value_states_54 = qkv_27[(Ellipsis, slice(6144, None, None))] + qkv_27 = None + view_81 = query_states_54.view((1, 2, -1, 96)) + query_states_54 = None + query_states_55 = view_81.transpose(1, 2) + view_81 = None + view_82 = key_states_54.view((1, 2, -1, 96)) + key_states_54 = None + key_states_55 = view_82.transpose(1, 2) + view_82 = None + view_83 = value_states_54.view((1, 2, -1, 96)) + value_states_54 = None + value_states_55 = view_83.transpose(1, 2) + view_83 = None + cos_27 = l_stack0_0_.unsqueeze(1) + sin_27 = l_stack0_1_.unsqueeze(1) + q_rot_27 = query_states_55[(Ellipsis, slice(None, 96, None))] + q_pass_27 = query_states_55[(Ellipsis, slice(96, None, None))] + query_states_55 = None + k_rot_27 = key_states_55[(Ellipsis, slice(None, 96, None))] + k_pass_27 = key_states_55[(Ellipsis, slice(96, None, None))] + key_states_55 = None + mul_245 = q_rot_27 * cos_27 + x1_54 = q_rot_27[(Ellipsis, slice(None, 48, None))] + x2_54 = q_rot_27[(Ellipsis, slice(48, None, None))] + q_rot_27 = None + neg_54 = -x2_54 + x2_54 = None + cat_108 = torch.cat((neg_54, x1_54), dim=-1) + neg_54 = x1_54 = None + mul_246 = cat_108 * sin_27 + cat_108 = None + add_163 = mul_245 + mul_246 + mul_245 = mul_246 = None + q_embed_27 = torch.cat([add_163, q_pass_27], dim=-1) + add_163 = q_pass_27 = None + mul_247 = k_rot_27 * cos_27 + cos_27 = None + x1_55 = k_rot_27[(Ellipsis, slice(None, 48, None))] + x2_55 = k_rot_27[(Ellipsis, slice(48, None, None))] + k_rot_27 = None + neg_55 = -x2_55 + x2_55 = None + cat_110 = torch.cat((neg_55, x1_55), dim=-1) + neg_55 = x1_55 = None + mul_248 = cat_110 * sin_27 + cat_110 = sin_27 = None + add_164 = mul_247 + mul_248 + mul_247 = mul_248 = None + k_embed_27 = torch.cat([add_164, k_pass_27], dim=-1) + add_164 = k_pass_27 = None + attention_mask_27 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_27 = q_embed_27.contiguous() + q_embed_27 = None + key_27 = k_embed_27.contiguous() + value_27 = value_states_55.contiguous() + attn_output_108 = torch._C._nn.scaled_dot_product_attention( + query_27, + key_27, + value_27, + attn_mask=attention_mask_27, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_27 = key_27 = value_27 = attention_mask_27 = None + transpose_111 = attn_output_108.transpose(1, 2) + attn_output_108 = None + attn_output_109 = transpose_111.contiguous() + transpose_111 = None + reshape_27 = attn_output_109.reshape(1, 2, -1) + attn_output_109 = None + attn_output_110 = reshape_27.contiguous() + reshape_27 = None + attn_output_111 = torch._C._nn.linear( + attn_output_110, + l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_110 = l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_54 = torch.nn.functional.dropout(attn_output_111, 0.0, False, False) + attn_output_111 = None + hidden_states_246 = hidden_states_242 + dropout_54 + hidden_states_242 = dropout_54 = None + hidden_states_247 = hidden_states_246.to(torch.float32) + pow_56 = hidden_states_247.pow(2) + variance_55 = pow_56.mean(-1, keepdim=True) 
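The pow/mean/rsqrt sequence that recurs before and after every attention and MLP block in this trace is an unrolled RMSNorm, and each MLP is the gated-SiLU pattern (gate_up_proj chunked into a gate half and an up half, here 16384 -> 2 x 8192, then down_proj back to 3072). A minimal sketch of both patterns, assuming eps = 1e-05 and bfloat16 weights as in the traced adds; the function names and signatures are illustrative, not part of the generated file:

import torch

def rms_norm(hidden_states: torch.Tensor, weight: torch.Tensor,
             eps: float = 1e-05) -> torch.Tensor:
    # The trace computes the variance in float32 (.to(torch.float32),
    # .pow(2), .mean(-1, keepdim=True)), normalizes with rsqrt, then
    # casts back to bfloat16 before scaling by the layernorm weight.
    hs = hidden_states.to(torch.float32)
    variance = hs.pow(2).mean(-1, keepdim=True)
    hs = hs * torch.rsqrt(variance + eps)
    return weight * hs.to(torch.bfloat16)

def gated_mlp(x: torch.Tensor, gate_up_w: torch.Tensor,
              down_w: torch.Tensor) -> torch.Tensor:
    # gate_up_proj is a single [16384, 3072] matmul whose output is
    # chunk(2, dim=-1)'d into gate and up; the activation is up * silu(gate).
    up_states = torch.nn.functional.linear(x, gate_up_w)
    gate, up = up_states.chunk(2, dim=-1)
    return torch.nn.functional.linear(up * torch.nn.functional.silu(gate), down_w)

(torch.nn.functional.linear is the public equivalent of the torch._C._nn.linear calls emitted by the trace.)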
+ pow_56 = None + add_166 = variance_55 + 1e-05 + variance_55 = None + rsqrt_55 = torch.rsqrt(add_166) + add_166 = None + hidden_states_248 = hidden_states_247 * rsqrt_55 + hidden_states_247 = rsqrt_55 = None + to_111 = hidden_states_248.to(torch.bfloat16) + hidden_states_248 = None + hidden_states_249 = ( + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ + * to_111 + ) + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = ( + to_111 + ) = None + up_states_81 = torch._C._nn.linear( + hidden_states_249, + l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_249 = l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_27 = up_states_81.chunk(2, dim=-1) + up_states_81 = None + gate_27 = chunk_27[0] + up_states_82 = chunk_27[1] + chunk_27 = None + silu_27 = torch.nn.functional.silu(gate_27, inplace=False) + gate_27 = None + up_states_83 = up_states_82 * silu_27 + up_states_82 = silu_27 = None + hidden_states_250 = torch._C._nn.linear( + up_states_83, + l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_83 = l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_55 = torch.nn.functional.dropout(hidden_states_250, 0.0, False, False) + hidden_states_250 = None + hidden_states_251 = hidden_states_246 + dropout_55 + hidden_states_246 = dropout_55 = None + hidden_states_252 = hidden_states_251.to(torch.float32) + pow_57 = hidden_states_252.pow(2) + variance_56 = pow_57.mean(-1, keepdim=True) + pow_57 = None + add_168 = variance_56 + 1e-05 + variance_56 = None + rsqrt_56 = torch.rsqrt(add_168) + add_168 = None + hidden_states_253 = hidden_states_252 * rsqrt_56 + hidden_states_252 = rsqrt_56 = None + to_113 = hidden_states_253.to(torch.bfloat16) + hidden_states_253 = None + hidden_states_254 = ( + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ + * to_113 + ) + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = ( + to_113 + ) = None + qkv_28 = torch._C._nn.linear( + hidden_states_254, + l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_254 = l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_56 = qkv_28[(Ellipsis, slice(None, 3072, None))] + key_states_56 = qkv_28[(Ellipsis, slice(3072, 6144, None))] + value_states_56 = qkv_28[(Ellipsis, slice(6144, None, None))] + qkv_28 = None + view_84 = query_states_56.view((1, 2, -1, 96)) + query_states_56 = None + query_states_57 = view_84.transpose(1, 2) + view_84 = None + view_85 = key_states_56.view((1, 2, -1, 96)) + key_states_56 = None + key_states_57 = view_85.transpose(1, 2) + view_85 = None + view_86 = value_states_56.view((1, 2, -1, 96)) + value_states_56 = None + value_states_57 = view_86.transpose(1, 2) + view_86 = None + cos_28 = l_stack0_0_.unsqueeze(1) + sin_28 = l_stack0_1_.unsqueeze(1) + q_rot_28 = query_states_57[(Ellipsis, slice(None, 96, None))] + q_pass_28 = query_states_57[(Ellipsis, slice(96, None, None))] + query_states_57 = None + k_rot_28 = key_states_57[(Ellipsis, slice(None, 96, None))] + k_pass_28 = key_states_57[(Ellipsis, slice(96, None, None))] + key_states_57 = None + mul_254 = q_rot_28 * cos_28 + x1_56 = q_rot_28[(Ellipsis, slice(None, 48, None))] + x2_56 = 
q_rot_28[(Ellipsis, slice(48, None, None))] + q_rot_28 = None + neg_56 = -x2_56 + x2_56 = None + cat_112 = torch.cat((neg_56, x1_56), dim=-1) + neg_56 = x1_56 = None + mul_255 = cat_112 * sin_28 + cat_112 = None + add_169 = mul_254 + mul_255 + mul_254 = mul_255 = None + q_embed_28 = torch.cat([add_169, q_pass_28], dim=-1) + add_169 = q_pass_28 = None + mul_256 = k_rot_28 * cos_28 + cos_28 = None + x1_57 = k_rot_28[(Ellipsis, slice(None, 48, None))] + x2_57 = k_rot_28[(Ellipsis, slice(48, None, None))] + k_rot_28 = None + neg_57 = -x2_57 + x2_57 = None + cat_114 = torch.cat((neg_57, x1_57), dim=-1) + neg_57 = x1_57 = None + mul_257 = cat_114 * sin_28 + cat_114 = sin_28 = None + add_170 = mul_256 + mul_257 + mul_256 = mul_257 = None + k_embed_28 = torch.cat([add_170, k_pass_28], dim=-1) + add_170 = k_pass_28 = None + attention_mask_28 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_28 = q_embed_28.contiguous() + q_embed_28 = None + key_28 = k_embed_28.contiguous() + value_28 = value_states_57.contiguous() + attn_output_112 = torch._C._nn.scaled_dot_product_attention( + query_28, + key_28, + value_28, + attn_mask=attention_mask_28, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_28 = key_28 = value_28 = attention_mask_28 = None + transpose_115 = attn_output_112.transpose(1, 2) + attn_output_112 = None + attn_output_113 = transpose_115.contiguous() + transpose_115 = None + reshape_28 = attn_output_113.reshape(1, 2, -1) + attn_output_113 = None + attn_output_114 = reshape_28.contiguous() + reshape_28 = None + attn_output_115 = torch._C._nn.linear( + attn_output_114, + l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_114 = l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_56 = torch.nn.functional.dropout(attn_output_115, 0.0, False, False) + attn_output_115 = None + hidden_states_255 = hidden_states_251 + dropout_56 + hidden_states_251 = dropout_56 = None + hidden_states_256 = hidden_states_255.to(torch.float32) + pow_58 = hidden_states_256.pow(2) + variance_57 = pow_58.mean(-1, keepdim=True) + pow_58 = None + add_172 = variance_57 + 1e-05 + variance_57 = None + rsqrt_57 = torch.rsqrt(add_172) + add_172 = None + hidden_states_257 = hidden_states_256 * rsqrt_57 + hidden_states_256 = rsqrt_57 = None + to_115 = hidden_states_257.to(torch.bfloat16) + hidden_states_257 = None + hidden_states_258 = ( + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ + * to_115 + ) + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = ( + to_115 + ) = None + up_states_84 = torch._C._nn.linear( + hidden_states_258, + l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_258 = l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_28 = up_states_84.chunk(2, dim=-1) + up_states_84 = None + gate_28 = chunk_28[0] + up_states_85 = chunk_28[1] + chunk_28 = None + silu_28 = torch.nn.functional.silu(gate_28, inplace=False) + gate_28 = None + up_states_86 = up_states_85 * silu_28 + up_states_85 = silu_28 = None + hidden_states_259 = torch._C._nn.linear( + up_states_86, + l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_86 = 
l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_57 = torch.nn.functional.dropout(hidden_states_259, 0.0, False, False) + hidden_states_259 = None + hidden_states_260 = hidden_states_255 + dropout_57 + hidden_states_255 = dropout_57 = None + hidden_states_261 = hidden_states_260.to(torch.float32) + pow_59 = hidden_states_261.pow(2) + variance_58 = pow_59.mean(-1, keepdim=True) + pow_59 = None + add_174 = variance_58 + 1e-05 + variance_58 = None + rsqrt_58 = torch.rsqrt(add_174) + add_174 = None + hidden_states_262 = hidden_states_261 * rsqrt_58 + hidden_states_261 = rsqrt_58 = None + to_117 = hidden_states_262.to(torch.bfloat16) + hidden_states_262 = None + hidden_states_263 = ( + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + * to_117 + ) + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = ( + to_117 + ) = None + qkv_29 = torch._C._nn.linear( + hidden_states_263, + l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_263 = l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_58 = qkv_29[(Ellipsis, slice(None, 3072, None))] + key_states_58 = qkv_29[(Ellipsis, slice(3072, 6144, None))] + value_states_58 = qkv_29[(Ellipsis, slice(6144, None, None))] + qkv_29 = None + view_87 = query_states_58.view((1, 2, -1, 96)) + query_states_58 = None + query_states_59 = view_87.transpose(1, 2) + view_87 = None + view_88 = key_states_58.view((1, 2, -1, 96)) + key_states_58 = None + key_states_59 = view_88.transpose(1, 2) + view_88 = None + view_89 = value_states_58.view((1, 2, -1, 96)) + value_states_58 = None + value_states_59 = view_89.transpose(1, 2) + view_89 = None + cos_29 = l_stack0_0_.unsqueeze(1) + sin_29 = l_stack0_1_.unsqueeze(1) + q_rot_29 = query_states_59[(Ellipsis, slice(None, 96, None))] + q_pass_29 = query_states_59[(Ellipsis, slice(96, None, None))] + query_states_59 = None + k_rot_29 = key_states_59[(Ellipsis, slice(None, 96, None))] + k_pass_29 = key_states_59[(Ellipsis, slice(96, None, None))] + key_states_59 = None + mul_263 = q_rot_29 * cos_29 + x1_58 = q_rot_29[(Ellipsis, slice(None, 48, None))] + x2_58 = q_rot_29[(Ellipsis, slice(48, None, None))] + q_rot_29 = None + neg_58 = -x2_58 + x2_58 = None + cat_116 = torch.cat((neg_58, x1_58), dim=-1) + neg_58 = x1_58 = None + mul_264 = cat_116 * sin_29 + cat_116 = None + add_175 = mul_263 + mul_264 + mul_263 = mul_264 = None + q_embed_29 = torch.cat([add_175, q_pass_29], dim=-1) + add_175 = q_pass_29 = None + mul_265 = k_rot_29 * cos_29 + cos_29 = None + x1_59 = k_rot_29[(Ellipsis, slice(None, 48, None))] + x2_59 = k_rot_29[(Ellipsis, slice(48, None, None))] + k_rot_29 = None + neg_59 = -x2_59 + x2_59 = None + cat_118 = torch.cat((neg_59, x1_59), dim=-1) + neg_59 = x1_59 = None + mul_266 = cat_118 * sin_29 + cat_118 = sin_29 = None + add_176 = mul_265 + mul_266 + mul_265 = mul_266 = None + k_embed_29 = torch.cat([add_176, k_pass_29], dim=-1) + add_176 = k_pass_29 = None + attention_mask_29 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_29 = q_embed_29.contiguous() + q_embed_29 = None + key_29 = k_embed_29.contiguous() + value_29 = value_states_59.contiguous() + attn_output_116 = torch._C._nn.scaled_dot_product_attention( + query_29, + key_29, + value_29, + attn_mask=attention_mask_29, + dropout_p=0.0, + 
scale=0.10206207261596575, + is_causal=False, + ) + query_29 = key_29 = value_29 = attention_mask_29 = None + transpose_119 = attn_output_116.transpose(1, 2) + attn_output_116 = None + attn_output_117 = transpose_119.contiguous() + transpose_119 = None + reshape_29 = attn_output_117.reshape(1, 2, -1) + attn_output_117 = None + attn_output_118 = reshape_29.contiguous() + reshape_29 = None + attn_output_119 = torch._C._nn.linear( + attn_output_118, + l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_118 = l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_58 = torch.nn.functional.dropout(attn_output_119, 0.0, False, False) + attn_output_119 = None + hidden_states_264 = hidden_states_260 + dropout_58 + hidden_states_260 = dropout_58 = None + hidden_states_265 = hidden_states_264.to(torch.float32) + pow_60 = hidden_states_265.pow(2) + variance_59 = pow_60.mean(-1, keepdim=True) + pow_60 = None + add_178 = variance_59 + 1e-05 + variance_59 = None + rsqrt_59 = torch.rsqrt(add_178) + add_178 = None + hidden_states_266 = hidden_states_265 * rsqrt_59 + hidden_states_265 = rsqrt_59 = None + to_119 = hidden_states_266.to(torch.bfloat16) + hidden_states_266 = None + hidden_states_267 = ( + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ + * to_119 + ) + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = ( + to_119 + ) = None + up_states_87 = torch._C._nn.linear( + hidden_states_267, + l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_267 = l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_29 = up_states_87.chunk(2, dim=-1) + up_states_87 = None + gate_29 = chunk_29[0] + up_states_88 = chunk_29[1] + chunk_29 = None + silu_29 = torch.nn.functional.silu(gate_29, inplace=False) + gate_29 = None + up_states_89 = up_states_88 * silu_29 + up_states_88 = silu_29 = None + hidden_states_268 = torch._C._nn.linear( + up_states_89, + l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_89 = l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_59 = torch.nn.functional.dropout(hidden_states_268, 0.0, False, False) + hidden_states_268 = None + hidden_states_269 = hidden_states_264 + dropout_59 + hidden_states_264 = dropout_59 = None + hidden_states_270 = hidden_states_269.to(torch.float32) + pow_61 = hidden_states_270.pow(2) + variance_60 = pow_61.mean(-1, keepdim=True) + pow_61 = None + add_180 = variance_60 + 1e-05 + variance_60 = None + rsqrt_60 = torch.rsqrt(add_180) + add_180 = None + hidden_states_271 = hidden_states_270 * rsqrt_60 + hidden_states_270 = rsqrt_60 = None + to_121 = hidden_states_271.to(torch.bfloat16) + hidden_states_271 = None + hidden_states_272 = ( + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + * to_121 + ) + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = ( + to_121 + ) = None + qkv_30 = torch._C._nn.linear( + hidden_states_272, + l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_272 = l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_60 = qkv_30[(Ellipsis, slice(None, 3072, None))] + key_states_60 = 
qkv_30[(Ellipsis, slice(3072, 6144, None))] + value_states_60 = qkv_30[(Ellipsis, slice(6144, None, None))] + qkv_30 = None + view_90 = query_states_60.view((1, 2, -1, 96)) + query_states_60 = None + query_states_61 = view_90.transpose(1, 2) + view_90 = None + view_91 = key_states_60.view((1, 2, -1, 96)) + key_states_60 = None + key_states_61 = view_91.transpose(1, 2) + view_91 = None + view_92 = value_states_60.view((1, 2, -1, 96)) + value_states_60 = None + value_states_61 = view_92.transpose(1, 2) + view_92 = None + cos_30 = l_stack0_0_.unsqueeze(1) + sin_30 = l_stack0_1_.unsqueeze(1) + q_rot_30 = query_states_61[(Ellipsis, slice(None, 96, None))] + q_pass_30 = query_states_61[(Ellipsis, slice(96, None, None))] + query_states_61 = None + k_rot_30 = key_states_61[(Ellipsis, slice(None, 96, None))] + k_pass_30 = key_states_61[(Ellipsis, slice(96, None, None))] + key_states_61 = None + mul_272 = q_rot_30 * cos_30 + x1_60 = q_rot_30[(Ellipsis, slice(None, 48, None))] + x2_60 = q_rot_30[(Ellipsis, slice(48, None, None))] + q_rot_30 = None + neg_60 = -x2_60 + x2_60 = None + cat_120 = torch.cat((neg_60, x1_60), dim=-1) + neg_60 = x1_60 = None + mul_273 = cat_120 * sin_30 + cat_120 = None + add_181 = mul_272 + mul_273 + mul_272 = mul_273 = None + q_embed_30 = torch.cat([add_181, q_pass_30], dim=-1) + add_181 = q_pass_30 = None + mul_274 = k_rot_30 * cos_30 + cos_30 = None + x1_61 = k_rot_30[(Ellipsis, slice(None, 48, None))] + x2_61 = k_rot_30[(Ellipsis, slice(48, None, None))] + k_rot_30 = None + neg_61 = -x2_61 + x2_61 = None + cat_122 = torch.cat((neg_61, x1_61), dim=-1) + neg_61 = x1_61 = None + mul_275 = cat_122 * sin_30 + cat_122 = sin_30 = None + add_182 = mul_274 + mul_275 + mul_274 = mul_275 = None + k_embed_30 = torch.cat([add_182, k_pass_30], dim=-1) + add_182 = k_pass_30 = None + attention_mask_30 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_30 = q_embed_30.contiguous() + q_embed_30 = None + key_30 = k_embed_30.contiguous() + value_30 = value_states_61.contiguous() + attn_output_120 = torch._C._nn.scaled_dot_product_attention( + query_30, + key_30, + value_30, + attn_mask=attention_mask_30, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_30 = key_30 = value_30 = attention_mask_30 = None + transpose_123 = attn_output_120.transpose(1, 2) + attn_output_120 = None + attn_output_121 = transpose_123.contiguous() + transpose_123 = None + reshape_30 = attn_output_121.reshape(1, 2, -1) + attn_output_121 = None + attn_output_122 = reshape_30.contiguous() + reshape_30 = None + attn_output_123 = torch._C._nn.linear( + attn_output_122, + l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_122 = l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_60 = torch.nn.functional.dropout(attn_output_123, 0.0, False, False) + attn_output_123 = None + hidden_states_273 = hidden_states_269 + dropout_60 + hidden_states_269 = dropout_60 = None + hidden_states_274 = hidden_states_273.to(torch.float32) + pow_62 = hidden_states_274.pow(2) + variance_61 = pow_62.mean(-1, keepdim=True) + pow_62 = None + add_184 = variance_61 + 1e-05 + variance_61 = None + rsqrt_61 = torch.rsqrt(add_184) + add_184 = None + hidden_states_275 = hidden_states_274 * rsqrt_61 + hidden_states_274 = rsqrt_61 = None + to_123 = hidden_states_275.to(torch.bfloat16) + hidden_states_275 = None + 
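Two constants in the attention blocks above are worth decoding. Each layer slices q/k into a rotated part (the first 96 channels) and a pass-through part; since the head dim is also 96 (view((1, 2, -1, 96)), 3072 = 32 heads x 96), the pass-through slice is empty, but the trace keeps the general split. The SDPA scale 0.10206207261596575 is 1/sqrt(96), i.e. 1/sqrt(head_dim). A condensed sketch of the rotate-half pattern, with names chosen here for illustration:

import math
import torch

def apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
                 rot_dim: int = 96) -> torch.Tensor:
    # q_rot_*/k_rot_* take the first rot_dim channels; q_pass_*/k_pass_*
    # take the rest (empty here, since rot_dim equals the head dim).
    x_rot, x_pass = x[..., :rot_dim], x[..., rot_dim:]
    # rotate_half: split at rot_dim // 2 = 48 and map (x1, x2) -> (-x2, x1),
    # the torch.cat((neg_*, x1_*), dim=-1) pattern in the trace.
    x1, x2 = x_rot[..., : rot_dim // 2], x_rot[..., rot_dim // 2 :]
    rotated = torch.cat((-x2, x1), dim=-1)
    return torch.cat([x_rot * cos + rotated * sin, x_pass], dim=-1)

# The SDPA scale used in every layer is 1/sqrt(head_dim):
assert math.isclose(1 / math.sqrt(96), 0.10206207261596575)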
hidden_states_276 = ( + l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ + * to_123 + ) + l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = ( + to_123 + ) = None + up_states_90 = torch._C._nn.linear( + hidden_states_276, + l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_276 = l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_30 = up_states_90.chunk(2, dim=-1) + up_states_90 = None + gate_30 = chunk_30[0] + up_states_91 = chunk_30[1] + chunk_30 = None + silu_30 = torch.nn.functional.silu(gate_30, inplace=False) + gate_30 = None + up_states_92 = up_states_91 * silu_30 + up_states_91 = silu_30 = None + hidden_states_277 = torch._C._nn.linear( + up_states_92, + l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_92 = l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_61 = torch.nn.functional.dropout(hidden_states_277, 0.0, False, False) + hidden_states_277 = None + hidden_states_278 = hidden_states_273 + dropout_61 + hidden_states_273 = dropout_61 = None + hidden_states_279 = hidden_states_278.to(torch.float32) + pow_63 = hidden_states_279.pow(2) + variance_62 = pow_63.mean(-1, keepdim=True) + pow_63 = None + add_186 = variance_62 + 1e-05 + variance_62 = None + rsqrt_62 = torch.rsqrt(add_186) + add_186 = None + hidden_states_280 = hidden_states_279 * rsqrt_62 + hidden_states_279 = rsqrt_62 = None + to_125 = hidden_states_280.to(torch.bfloat16) + hidden_states_280 = None + hidden_states_281 = ( + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ + * to_125 + ) + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = ( + to_125 + ) = None + qkv_31 = torch._C._nn.linear( + hidden_states_281, + l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_281 = l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_62 = qkv_31[(Ellipsis, slice(None, 3072, None))] + key_states_62 = qkv_31[(Ellipsis, slice(3072, 6144, None))] + value_states_62 = qkv_31[(Ellipsis, slice(6144, None, None))] + qkv_31 = None + view_93 = query_states_62.view((1, 2, -1, 96)) + query_states_62 = None + query_states_63 = view_93.transpose(1, 2) + view_93 = None + view_94 = key_states_62.view((1, 2, -1, 96)) + key_states_62 = None + key_states_63 = view_94.transpose(1, 2) + view_94 = None + view_95 = value_states_62.view((1, 2, -1, 96)) + value_states_62 = None + value_states_63 = view_95.transpose(1, 2) + view_95 = None + cos_31 = l_stack0_0_.unsqueeze(1) + l_stack0_0_ = None + sin_31 = l_stack0_1_.unsqueeze(1) + l_stack0_1_ = None + q_rot_31 = query_states_63[(Ellipsis, slice(None, 96, None))] + q_pass_31 = query_states_63[(Ellipsis, slice(96, None, None))] + query_states_63 = None + k_rot_31 = key_states_63[(Ellipsis, slice(None, 96, None))] + k_pass_31 = key_states_63[(Ellipsis, slice(96, None, None))] + key_states_63 = None + mul_281 = q_rot_31 * cos_31 + x1_62 = q_rot_31[(Ellipsis, slice(None, 48, None))] + x2_62 = q_rot_31[(Ellipsis, slice(48, None, None))] + q_rot_31 = None + neg_62 = -x2_62 + x2_62 = None + cat_124 = torch.cat((neg_62, x1_62), dim=-1) + neg_62 = x1_62 = None + mul_282 = cat_124 * sin_31 + cat_124 = None + add_187 = mul_281 + mul_282 + mul_281 = mul_282 
= None + q_embed_31 = torch.cat([add_187, q_pass_31], dim=-1) + add_187 = q_pass_31 = None + mul_283 = k_rot_31 * cos_31 + cos_31 = None + x1_63 = k_rot_31[(Ellipsis, slice(None, 48, None))] + x2_63 = k_rot_31[(Ellipsis, slice(48, None, None))] + k_rot_31 = None + neg_63 = -x2_63 + x2_63 = None + cat_126 = torch.cat((neg_63, x1_63), dim=-1) + neg_63 = x1_63 = None + mul_284 = cat_126 * sin_31 + cat_126 = sin_31 = None + add_188 = mul_283 + mul_284 + mul_283 = mul_284 = None + k_embed_31 = torch.cat([add_188, k_pass_31], dim=-1) + add_188 = k_pass_31 = None + attention_mask_31 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + l_causal_mask_ = None + query_31 = q_embed_31.contiguous() + q_embed_31 = None + key_31 = k_embed_31.contiguous() + value_31 = value_states_63.contiguous() + attn_output_124 = torch._C._nn.scaled_dot_product_attention( + query_31, + key_31, + value_31, + attn_mask=attention_mask_31, + dropout_p=0.0, + scale=0.10206207261596575, + is_causal=False, + ) + query_31 = key_31 = value_31 = attention_mask_31 = None + transpose_127 = attn_output_124.transpose(1, 2) + attn_output_124 = None + attn_output_125 = transpose_127.contiguous() + transpose_127 = None + reshape_31 = attn_output_125.reshape(1, 2, -1) + attn_output_125 = None + attn_output_126 = reshape_31.contiguous() + reshape_31 = None + attn_output_127 = torch._C._nn.linear( + attn_output_126, + l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_126 = l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_62 = torch.nn.functional.dropout(attn_output_127, 0.0, False, False) + attn_output_127 = None + hidden_states_282 = hidden_states_278 + dropout_62 + hidden_states_278 = dropout_62 = None + hidden_states_283 = hidden_states_282.to(torch.float32) + pow_64 = hidden_states_283.pow(2) + variance_63 = pow_64.mean(-1, keepdim=True) + pow_64 = None + add_190 = variance_63 + 1e-05 + variance_63 = None + rsqrt_63 = torch.rsqrt(add_190) + add_190 = None + hidden_states_284 = hidden_states_283 * rsqrt_63 + hidden_states_283 = rsqrt_63 = None + to_127 = hidden_states_284.to(torch.bfloat16) + hidden_states_284 = None + hidden_states_285 = ( + l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ + * to_127 + ) + l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = ( + to_127 + ) = None + up_states_93 = torch._C._nn.linear( + hidden_states_285, + l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_285 = l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_31 = up_states_93.chunk(2, dim=-1) + up_states_93 = None + gate_31 = chunk_31[0] + up_states_94 = chunk_31[1] + chunk_31 = None + silu_31 = torch.nn.functional.silu(gate_31, inplace=False) + gate_31 = None + up_states_95 = up_states_94 * silu_31 + up_states_94 = silu_31 = None + hidden_states_286 = torch._C._nn.linear( + up_states_95, + l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_95 = l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_63 = torch.nn.functional.dropout(hidden_states_286, 0.0, False, False) + hidden_states_286 = None + hidden_states_287 = hidden_states_282 + dropout_63 + hidden_states_282 = 
dropout_63 = None
+        hidden_states_288 = hidden_states_287.to(torch.float32)
+        hidden_states_287 = None
+        pow_65 = hidden_states_288.pow(2)
+        variance_64 = pow_65.mean(-1, keepdim=True)
+        pow_65 = None
+        add_192 = variance_64 + 1e-05
+        variance_64 = None
+        rsqrt_64 = torch.rsqrt(add_192)
+        add_192 = None
+        hidden_states_289 = hidden_states_288 * rsqrt_64
+        hidden_states_288 = rsqrt_64 = None
+        to_129 = hidden_states_289.to(torch.bfloat16)
+        hidden_states_289 = None
+        hidden_states_290 = l_self_modules_norm_parameters_weight_ * to_129
+        l_self_modules_norm_parameters_weight_ = to_129 = None
+        return (
+            value_states_1,
+            k_embed,
+            value_states_3,
+            k_embed_1,
+            value_states_5,
+            k_embed_2,
+            value_states_7,
+            k_embed_3,
+            value_states_9,
+            k_embed_4,
+            value_states_11,
+            k_embed_5,
+            value_states_13,
+            k_embed_6,
+            value_states_15,
+            k_embed_7,
+            value_states_17,
+            k_embed_8,
+            value_states_19,
+            k_embed_9,
+            value_states_21,
+            k_embed_10,
+            value_states_23,
+            k_embed_11,
+            value_states_25,
+            k_embed_12,
+            value_states_27,
+            k_embed_13,
+            value_states_29,
+            k_embed_14,
+            value_states_31,
+            k_embed_15,
+            value_states_33,
+            k_embed_16,
+            value_states_35,
+            k_embed_17,
+            value_states_37,
+            k_embed_18,
+            value_states_39,
+            k_embed_19,
+            value_states_41,
+            k_embed_20,
+            value_states_43,
+            k_embed_21,
+            value_states_45,
+            k_embed_22,
+            value_states_47,
+            k_embed_23,
+            value_states_49,
+            k_embed_24,
+            value_states_51,
+            k_embed_25,
+            value_states_53,
+            k_embed_26,
+            value_states_55,
+            k_embed_27,
+            value_states_57,
+            k_embed_28,
+            value_states_59,
+            k_embed_29,
+            value_states_61,
+            k_embed_30,
+            value_states_63,
+            k_embed_31,
+            hidden_states_290,
+        )
diff --git a/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/weight_meta.py b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/weight_meta.py
new file mode 100644
index 000000000..c138b6cf3
--- /dev/null
+++ b/samples/transformers-auto-model/microsoft/Phi-3.5-mini-instruct/weight_meta.py
@@ -0,0 +1,1968 @@
+class Program_weight_tensor_meta_L_hidden_states_:
+    name = "L_hidden_states_"
+    shape = [1, 2, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_"
+    shape = [3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_"
+    shape = [9216, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_stack0_0_:
+    name = "L_stack0_0_"
+    shape = [1, 2, 96]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.172
+    std = 0.074
+    data = None
+
+
+class Program_weight_tensor_meta_L_stack0_1_:
+    name = "L_stack0_1_"
+    shape = [1, 2, 96]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.062
+    std = 0.181
+    data = None
+
+
+class Program_weight_tensor_meta_L_causal_mask_:
+    name = "L_causal_mask_"
+    shape = [1, 1, 2, 2]
+    dtype = "torch.bool"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [True, False, True, True]
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_:
+    name =
"L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + 
mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = 
"torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = 
"L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + 
std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_" + shape = 
[3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + 
mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_" + shape = [3072] + 
dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = 
"cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = 
[9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: + name = 
"L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [9216, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_norm_parameters_weight_: + name = "L_self_modules_norm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/graph_hash.txt b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/graph_hash.txt new file mode 100644 index 000000000..8565ba660 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/graph_hash.txt @@ -0,0 +1 @@ +a03b91d07adde4eab2e0a43959e210f2b1be3fe3d7ce59472d9debaad477b480 \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/graph_net.json b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/input_meta.py 
b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/input_tensor_constraints.py b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/model.py b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/model.py new file mode 100644 index 000000000..2af28ecf3 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/model.py @@ -0,0 +1,6632 @@ +import torch + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_hidden_states_: torch.Tensor, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_stack0_0_: torch.Tensor, + L_stack0_1_: torch.Tensor, + L_causal_mask_: torch.Tensor, + L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
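Every parameter in this flat signature follows Dynamo's lifting convention: attribute accesses are recorded as `_modules_` / `_parameters_` segments, so each name encodes the dotted path of the original weight. A minimal sketch of the reverse mapping (the helper name and regex are illustrative assumptions, not part of the sample):

```python
import re

def unmangle(name: str) -> str:
    # Map a Dynamo-lifted parameter name back to its dotted module path.
    name = re.sub(r"^L_self_modules_", "", name.strip("_"))
    return name.replace("_modules_", ".").replace("_parameters_", ".")

# prints: layers.14.self_attn.qkv_proj.weight
print(unmangle(
    "L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_"
))
```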
L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_norm_parameters_weight_: torch.nn.parameter.Parameter, + ): + l_hidden_states_ = L_hidden_states_ + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_stack0_0_ = L_stack0_0_ + l_stack0_1_ = L_stack0_1_ + l_causal_mask_ = L_causal_mask_ + l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + ) + 
l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = 
L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ + 
l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = 
L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ + 
l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = 
L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + 
l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = ( + 
L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ + 
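The long run of `l_* = L_*` assignments simply rebinds every lifted weight to a local before the unrolled computation begins. In the eager module this entire body collapses to a loop over the decoder layers; a hedged sketch of that equivalent, for orientation only (module and argument names follow the usual transformers layout and are assumptions here):

```python
hidden_states = inputs_embeds
for decoder_layer in self.layers:          # unrolled 32x in the capture
    hidden_states = decoder_layer(
        hidden_states,
        attention_mask=causal_mask,        # l_causal_mask_
        position_embeddings=(cos, sin),    # l_stack0_0_, l_stack0_1_
    )
hidden_states = self.norm(hidden_states)   # l_self_modules_norm_parameters_weight_
```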
l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = 
L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_norm_parameters_weight_ = L_self_modules_norm_parameters_weight_ + _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module") + _log_api_usage_once = None + hidden_states = l_hidden_states_.to(torch.float32) + pow_1 = hidden_states.pow(2) + variance = pow_1.mean(-1, keepdim=True) + pow_1 = None + add = variance + 1e-05 + variance = None + rsqrt = torch.rsqrt(add) + add = None + hidden_states_1 = hidden_states * rsqrt + hidden_states = rsqrt = None + to_1 = hidden_states_1.to(torch.bfloat16) + hidden_states_1 = None + hidden_states_2 = ( + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + * to_1 + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + to_1 + ) = None + qkv = torch._C._nn.linear( + hidden_states_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states = qkv[(Ellipsis, slice(None, 3072, None))] + key_states = qkv[(Ellipsis, slice(3072, 4096, None))] + value_states = qkv[(Ellipsis, slice(4096, None, None))] + qkv = None + view = query_states.view((1, 2, -1, 128)) + query_states = None + query_states_1 = view.transpose(1, 2) + view = None + view_1 = key_states.view((1, 2, -1, 128)) + key_states = None + key_states_1 = view_1.transpose(1, 2) + view_1 = None + view_2 = value_states.view((1, 2, -1, 128)) + value_states = None + value_states_1 = view_2.transpose(1, 2) + view_2 = None + cos = l_stack0_0_.unsqueeze(1) + sin = l_stack0_1_.unsqueeze(1) + q_rot = query_states_1[(Ellipsis, slice(None, 96, None))] + q_pass = query_states_1[(Ellipsis, slice(96, None, None))] + query_states_1 = None + k_rot = key_states_1[(Ellipsis, slice(None, 96, None))] + k_pass = key_states_1[(Ellipsis, slice(96, None, None))] + key_states_1 = None + mul_2 = q_rot * cos + x1 = q_rot[(Ellipsis, slice(None, 48, None))] + x2 = q_rot[(Ellipsis, slice(48, None, None))] + q_rot = None + neg = -x2 + x2 = None + cat = torch.cat((neg, x1), dim=-1) + neg = x1 = None + mul_3 = cat * sin + cat = None + add_1 = mul_2 + mul_3 + mul_2 = mul_3 = None + q_embed = torch.cat([add_1, q_pass], dim=-1) + add_1 = q_pass = None + mul_4 = k_rot * cos + cos = None + x1_1 = k_rot[(Ellipsis, slice(None, 48, None))] + x2_1 = k_rot[(Ellipsis, slice(48, None, None))] + k_rot = None + neg_1 = -x2_1 + x2_1 = None + cat_2 = torch.cat((neg_1, x1_1), dim=-1) + neg_1 = x1_1 = None + mul_5 = cat_2 * sin + cat_2 = sin = None + add_2 = mul_4 + mul_5 + mul_4 = mul_5 = None + k_embed = torch.cat([add_2, k_pass], dim=-1) + add_2 = k_pass = None + getitem_11 = k_embed[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_3 = getitem_11.expand(1, 8, 3, 2, 128) + getitem_11 = None + key = hidden_states_3.reshape(1, 24, 2, 128) + hidden_states_3 = None + getitem_12 = value_states_1[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_4 = getitem_12.expand(1, 8, 3, 2, 128) + getitem_12 = None + value = hidden_states_4.reshape(1, 24, 2, 128) + hidden_states_4 = None + attention_mask = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query = 
q_embed.contiguous() + q_embed = None + key_1 = key.contiguous() + key = None + value_1 = value.contiguous() + value = None + attn_output = torch._C._nn.scaled_dot_product_attention( + query, + key_1, + value_1, + attn_mask=attention_mask, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query = key_1 = value_1 = attention_mask = None + transpose_3 = attn_output.transpose(1, 2) + attn_output = None + attn_output_1 = transpose_3.contiguous() + transpose_3 = None + reshape_2 = attn_output_1.reshape(1, 2, -1) + attn_output_1 = None + attn_output_2 = reshape_2.contiguous() + reshape_2 = None + attn_output_3 = torch._C._nn.linear( + attn_output_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout = torch.nn.functional.dropout(attn_output_3, 0.0, False, False) + attn_output_3 = None + hidden_states_5 = l_hidden_states_ + dropout + l_hidden_states_ = dropout = None + hidden_states_6 = hidden_states_5.to(torch.float32) + pow_2 = hidden_states_6.pow(2) + variance_1 = pow_2.mean(-1, keepdim=True) + pow_2 = None + add_4 = variance_1 + 1e-05 + variance_1 = None + rsqrt_1 = torch.rsqrt(add_4) + add_4 = None + hidden_states_7 = hidden_states_6 * rsqrt_1 + hidden_states_6 = rsqrt_1 = None + to_3 = hidden_states_7.to(torch.bfloat16) + hidden_states_7 = None + hidden_states_8 = ( + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + * to_3 + ) + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = ( + to_3 + ) = None + up_states = torch._C._nn.linear( + hidden_states_8, + l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_8 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk = up_states.chunk(2, dim=-1) + up_states = None + gate = chunk[0] + up_states_1 = chunk[1] + chunk = None + silu = torch.nn.functional.silu(gate, inplace=False) + gate = None + up_states_2 = up_states_1 * silu + up_states_1 = silu = None + hidden_states_9 = torch._C._nn.linear( + up_states_2, + l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_2 = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_1 = torch.nn.functional.dropout(hidden_states_9, 0.0, False, False) + hidden_states_9 = None + hidden_states_10 = hidden_states_5 + dropout_1 + hidden_states_5 = dropout_1 = None + hidden_states_11 = hidden_states_10.to(torch.float32) + pow_3 = hidden_states_11.pow(2) + variance_2 = pow_3.mean(-1, keepdim=True) + pow_3 = None + add_6 = variance_2 + 1e-05 + variance_2 = None + rsqrt_2 = torch.rsqrt(add_6) + add_6 = None + hidden_states_12 = hidden_states_11 * rsqrt_2 + hidden_states_11 = rsqrt_2 = None + to_5 = hidden_states_12.to(torch.bfloat16) + hidden_states_12 = None + hidden_states_13 = ( + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + * to_5 + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + to_5 + ) = None + qkv_1 = torch._C._nn.linear( + hidden_states_13, + l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_13 = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_2 = 
qkv_1[(Ellipsis, slice(None, 3072, None))] + key_states_2 = qkv_1[(Ellipsis, slice(3072, 4096, None))] + value_states_2 = qkv_1[(Ellipsis, slice(4096, None, None))] + qkv_1 = None + view_3 = query_states_2.view((1, 2, -1, 128)) + query_states_2 = None + query_states_3 = view_3.transpose(1, 2) + view_3 = None + view_4 = key_states_2.view((1, 2, -1, 128)) + key_states_2 = None + key_states_3 = view_4.transpose(1, 2) + view_4 = None + view_5 = value_states_2.view((1, 2, -1, 128)) + value_states_2 = None + value_states_3 = view_5.transpose(1, 2) + view_5 = None + cos_1 = l_stack0_0_.unsqueeze(1) + sin_1 = l_stack0_1_.unsqueeze(1) + q_rot_1 = query_states_3[(Ellipsis, slice(None, 96, None))] + q_pass_1 = query_states_3[(Ellipsis, slice(96, None, None))] + query_states_3 = None + k_rot_1 = key_states_3[(Ellipsis, slice(None, 96, None))] + k_pass_1 = key_states_3[(Ellipsis, slice(96, None, None))] + key_states_3 = None + mul_11 = q_rot_1 * cos_1 + x1_2 = q_rot_1[(Ellipsis, slice(None, 48, None))] + x2_2 = q_rot_1[(Ellipsis, slice(48, None, None))] + q_rot_1 = None + neg_2 = -x2_2 + x2_2 = None + cat_4 = torch.cat((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + mul_12 = cat_4 * sin_1 + cat_4 = None + add_7 = mul_11 + mul_12 + mul_11 = mul_12 = None + q_embed_1 = torch.cat([add_7, q_pass_1], dim=-1) + add_7 = q_pass_1 = None + mul_13 = k_rot_1 * cos_1 + cos_1 = None + x1_3 = k_rot_1[(Ellipsis, slice(None, 48, None))] + x2_3 = k_rot_1[(Ellipsis, slice(48, None, None))] + k_rot_1 = None + neg_3 = -x2_3 + x2_3 = None + cat_6 = torch.cat((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + mul_14 = cat_6 * sin_1 + cat_6 = sin_1 = None + add_8 = mul_13 + mul_14 + mul_13 = mul_14 = None + k_embed_1 = torch.cat([add_8, k_pass_1], dim=-1) + add_8 = k_pass_1 = None + getitem_27 = k_embed_1[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_14 = getitem_27.expand(1, 8, 3, 2, 128) + getitem_27 = None + key_2 = hidden_states_14.reshape(1, 24, 2, 128) + hidden_states_14 = None + getitem_28 = value_states_3[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_15 = getitem_28.expand(1, 8, 3, 2, 128) + getitem_28 = None + value_2 = hidden_states_15.reshape(1, 24, 2, 128) + hidden_states_15 = None + attention_mask_1 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_1 = q_embed_1.contiguous() + q_embed_1 = None + key_3 = key_2.contiguous() + key_2 = None + value_3 = value_2.contiguous() + value_2 = None + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_3, + value_3, + attn_mask=attention_mask_1, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_1 = key_3 = value_3 = attention_mask_1 = None + transpose_7 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_7.contiguous() + transpose_7 = None + reshape_5 = attn_output_5.reshape(1, 2, -1) + attn_output_5 = None + attn_output_6 = reshape_5.contiguous() + reshape_5 = None + attn_output_7 = torch._C._nn.linear( + attn_output_6, + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_6 = l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_2 = torch.nn.functional.dropout(attn_output_7, 0.0, False, False) + 
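Each attention block above repeats the same pattern: the fused `qkv` output is sliced at 3072 and 4096 into query, key, and value states, rotary embeddings are applied to only the first 96 of the 128 head dimensions, and the 8 key/value heads are expanded threefold to match the 24 query heads before SDPA. A compact sketch of that pattern (function and argument names are illustrative, not from the sample):

```python
import torch
import torch.nn.functional as F

def rotate_half(x):
    half = x.shape[-1] // 2
    return torch.cat((-x[..., half:], x[..., :half]), dim=-1)

def attention(q, k, v, cos, sin, mask, rot_dim=96, n_rep=3):
    # Partial RoPE: rotate only the first rot_dim dims, pass the rest through
    # (the slice-at-96 / cat-with-*_pass pattern in the graph).
    q = torch.cat([q[..., :rot_dim] * cos + rotate_half(q[..., :rot_dim]) * sin,
                   q[..., rot_dim:]], dim=-1)
    k = torch.cat([k[..., :rot_dim] * cos + rotate_half(k[..., :rot_dim]) * sin,
                   k[..., rot_dim:]], dim=-1)
    # GQA: replicate each KV head n_rep times (the expand/reshape pair above).
    b, h_kv, s, d = k.shape
    k = k[:, :, None].expand(b, h_kv, n_rep, s, d).reshape(b, h_kv * n_rep, s, d)
    v = v[:, :, None].expand(b, h_kv, n_rep, s, d).reshape(b, h_kv * n_rep, s, d)
    # The captured scale 0.08838834764831845 is exactly 1 / sqrt(128).
    return F.scaled_dot_product_attention(
        q, k, v, attn_mask=mask, scale=d ** -0.5, is_causal=False
    )
```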
attn_output_7 = None + hidden_states_16 = hidden_states_10 + dropout_2 + hidden_states_10 = dropout_2 = None + hidden_states_17 = hidden_states_16.to(torch.float32) + pow_4 = hidden_states_17.pow(2) + variance_3 = pow_4.mean(-1, keepdim=True) + pow_4 = None + add_10 = variance_3 + 1e-05 + variance_3 = None + rsqrt_3 = torch.rsqrt(add_10) + add_10 = None + hidden_states_18 = hidden_states_17 * rsqrt_3 + hidden_states_17 = rsqrt_3 = None + to_7 = hidden_states_18.to(torch.bfloat16) + hidden_states_18 = None + hidden_states_19 = ( + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + * to_7 + ) + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = ( + to_7 + ) = None + up_states_3 = torch._C._nn.linear( + hidden_states_19, + l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_19 = l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_1 = up_states_3.chunk(2, dim=-1) + up_states_3 = None + gate_1 = chunk_1[0] + up_states_4 = chunk_1[1] + chunk_1 = None + silu_1 = torch.nn.functional.silu(gate_1, inplace=False) + gate_1 = None + up_states_5 = up_states_4 * silu_1 + up_states_4 = silu_1 = None + hidden_states_20 = torch._C._nn.linear( + up_states_5, + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_5 = l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_3 = torch.nn.functional.dropout(hidden_states_20, 0.0, False, False) + hidden_states_20 = None + hidden_states_21 = hidden_states_16 + dropout_3 + hidden_states_16 = dropout_3 = None + hidden_states_22 = hidden_states_21.to(torch.float32) + pow_5 = hidden_states_22.pow(2) + variance_4 = pow_5.mean(-1, keepdim=True) + pow_5 = None + add_12 = variance_4 + 1e-05 + variance_4 = None + rsqrt_4 = torch.rsqrt(add_12) + add_12 = None + hidden_states_23 = hidden_states_22 * rsqrt_4 + hidden_states_22 = rsqrt_4 = None + to_9 = hidden_states_23.to(torch.bfloat16) + hidden_states_23 = None + hidden_states_24 = ( + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + * to_9 + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + to_9 + ) = None + qkv_2 = torch._C._nn.linear( + hidden_states_24, + l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_24 = l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_4 = qkv_2[(Ellipsis, slice(None, 3072, None))] + key_states_4 = qkv_2[(Ellipsis, slice(3072, 4096, None))] + value_states_4 = qkv_2[(Ellipsis, slice(4096, None, None))] + qkv_2 = None + view_6 = query_states_4.view((1, 2, -1, 128)) + query_states_4 = None + query_states_5 = view_6.transpose(1, 2) + view_6 = None + view_7 = key_states_4.view((1, 2, -1, 128)) + key_states_4 = None + key_states_5 = view_7.transpose(1, 2) + view_7 = None + view_8 = value_states_4.view((1, 2, -1, 128)) + value_states_4 = None + value_states_5 = view_8.transpose(1, 2) + view_8 = None + cos_2 = l_stack0_0_.unsqueeze(1) + sin_2 = l_stack0_1_.unsqueeze(1) + q_rot_2 = query_states_5[(Ellipsis, slice(None, 96, None))] + q_pass_2 = query_states_5[(Ellipsis, slice(96, None, None))] + query_states_5 = None + k_rot_2 = key_states_5[(Ellipsis, slice(None, 96, None))] + k_pass_2 = key_states_5[(Ellipsis, slice(96, None, None))] + key_states_5 = 
None + mul_20 = q_rot_2 * cos_2 + x1_4 = q_rot_2[(Ellipsis, slice(None, 48, None))] + x2_4 = q_rot_2[(Ellipsis, slice(48, None, None))] + q_rot_2 = None + neg_4 = -x2_4 + x2_4 = None + cat_8 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_21 = cat_8 * sin_2 + cat_8 = None + add_13 = mul_20 + mul_21 + mul_20 = mul_21 = None + q_embed_2 = torch.cat([add_13, q_pass_2], dim=-1) + add_13 = q_pass_2 = None + mul_22 = k_rot_2 * cos_2 + cos_2 = None + x1_5 = k_rot_2[(Ellipsis, slice(None, 48, None))] + x2_5 = k_rot_2[(Ellipsis, slice(48, None, None))] + k_rot_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_10 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_23 = cat_10 * sin_2 + cat_10 = sin_2 = None + add_14 = mul_22 + mul_23 + mul_22 = mul_23 = None + k_embed_2 = torch.cat([add_14, k_pass_2], dim=-1) + add_14 = k_pass_2 = None + getitem_43 = k_embed_2[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_25 = getitem_43.expand(1, 8, 3, 2, 128) + getitem_43 = None + key_4 = hidden_states_25.reshape(1, 24, 2, 128) + hidden_states_25 = None + getitem_44 = value_states_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_26 = getitem_44.expand(1, 8, 3, 2, 128) + getitem_44 = None + value_4 = hidden_states_26.reshape(1, 24, 2, 128) + hidden_states_26 = None + attention_mask_2 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_2 = q_embed_2.contiguous() + q_embed_2 = None + key_5 = key_4.contiguous() + key_4 = None + value_5 = value_4.contiguous() + value_4 = None + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_5, + value_5, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_2 = key_5 = value_5 = attention_mask_2 = None + transpose_11 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_11.contiguous() + transpose_11 = None + reshape_8 = attn_output_9.reshape(1, 2, -1) + attn_output_9 = None + attn_output_10 = reshape_8.contiguous() + reshape_8 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_10 = l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_4 = torch.nn.functional.dropout(attn_output_11, 0.0, False, False) + attn_output_11 = None + hidden_states_27 = hidden_states_21 + dropout_4 + hidden_states_21 = dropout_4 = None + hidden_states_28 = hidden_states_27.to(torch.float32) + pow_6 = hidden_states_28.pow(2) + variance_5 = pow_6.mean(-1, keepdim=True) + pow_6 = None + add_16 = variance_5 + 1e-05 + variance_5 = None + rsqrt_5 = torch.rsqrt(add_16) + add_16 = None + hidden_states_29 = hidden_states_28 * rsqrt_5 + hidden_states_28 = rsqrt_5 = None + to_11 = hidden_states_29.to(torch.bfloat16) + hidden_states_29 = None + hidden_states_30 = ( + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + * to_11 + ) + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = ( + to_11 + ) = None + up_states_6 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + 
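The feed-forward block that follows each attention block (and continues below for layer 2) is a gated MLP: `gate_up_proj` produces both halves in one matmul, the output is chunked in two, the first half gates the second through SiLU, and `down_proj` maps back to the hidden size. A minimal functional sketch (names are assumptions, not part of the sample):

```python
import torch.nn.functional as F

def gated_mlp(x, gate_up_weight, down_weight):
    # One fused matmul, then split: chunk[0] is the gate, chunk[1] the up path.
    gate, up = F.linear(x, gate_up_weight).chunk(2, dim=-1)
    # SiLU-gated product, then project back down (SwiGLU-style).
    return F.linear(up * F.silu(gate), down_weight)
```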
hidden_states_30 = l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_2 = up_states_6.chunk(2, dim=-1) + up_states_6 = None + gate_2 = chunk_2[0] + up_states_7 = chunk_2[1] + chunk_2 = None + silu_2 = torch.nn.functional.silu(gate_2, inplace=False) + gate_2 = None + up_states_8 = up_states_7 * silu_2 + up_states_7 = silu_2 = None + hidden_states_31 = torch._C._nn.linear( + up_states_8, + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_8 = l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_5 = torch.nn.functional.dropout(hidden_states_31, 0.0, False, False) + hidden_states_31 = None + hidden_states_32 = hidden_states_27 + dropout_5 + hidden_states_27 = dropout_5 = None + hidden_states_33 = hidden_states_32.to(torch.float32) + pow_7 = hidden_states_33.pow(2) + variance_6 = pow_7.mean(-1, keepdim=True) + pow_7 = None + add_18 = variance_6 + 1e-05 + variance_6 = None + rsqrt_6 = torch.rsqrt(add_18) + add_18 = None + hidden_states_34 = hidden_states_33 * rsqrt_6 + hidden_states_33 = rsqrt_6 = None + to_13 = hidden_states_34.to(torch.bfloat16) + hidden_states_34 = None + hidden_states_35 = ( + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + * to_13 + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + to_13 + ) = None + qkv_3 = torch._C._nn.linear( + hidden_states_35, + l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_35 = l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_6 = qkv_3[(Ellipsis, slice(None, 3072, None))] + key_states_6 = qkv_3[(Ellipsis, slice(3072, 4096, None))] + value_states_6 = qkv_3[(Ellipsis, slice(4096, None, None))] + qkv_3 = None + view_9 = query_states_6.view((1, 2, -1, 128)) + query_states_6 = None + query_states_7 = view_9.transpose(1, 2) + view_9 = None + view_10 = key_states_6.view((1, 2, -1, 128)) + key_states_6 = None + key_states_7 = view_10.transpose(1, 2) + view_10 = None + view_11 = value_states_6.view((1, 2, -1, 128)) + value_states_6 = None + value_states_7 = view_11.transpose(1, 2) + view_11 = None + cos_3 = l_stack0_0_.unsqueeze(1) + sin_3 = l_stack0_1_.unsqueeze(1) + q_rot_3 = query_states_7[(Ellipsis, slice(None, 96, None))] + q_pass_3 = query_states_7[(Ellipsis, slice(96, None, None))] + query_states_7 = None + k_rot_3 = key_states_7[(Ellipsis, slice(None, 96, None))] + k_pass_3 = key_states_7[(Ellipsis, slice(96, None, None))] + key_states_7 = None + mul_29 = q_rot_3 * cos_3 + x1_6 = q_rot_3[(Ellipsis, slice(None, 48, None))] + x2_6 = q_rot_3[(Ellipsis, slice(48, None, None))] + q_rot_3 = None + neg_6 = -x2_6 + x2_6 = None + cat_12 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_30 = cat_12 * sin_3 + cat_12 = None + add_19 = mul_29 + mul_30 + mul_29 = mul_30 = None + q_embed_3 = torch.cat([add_19, q_pass_3], dim=-1) + add_19 = q_pass_3 = None + mul_31 = k_rot_3 * cos_3 + cos_3 = None + x1_7 = k_rot_3[(Ellipsis, slice(None, 48, None))] + x2_7 = k_rot_3[(Ellipsis, slice(48, None, None))] + k_rot_3 = None + neg_7 = -x2_7 + x2_7 = None + cat_14 = torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_32 = cat_14 * sin_3 + cat_14 = sin_3 = None + add_20 = mul_31 + mul_32 + mul_31 = mul_32 = None + k_embed_3 = torch.cat([add_20, k_pass_3], dim=-1) + add_20 = k_pass_3 = None + getitem_59 = k_embed_3[ + 
+    getitem_59 = k_embed_3[:, :, None, :, :]
+    hidden_states_36 = getitem_59.expand(1, 8, 3, 2, 128)
+    getitem_59 = None
+    key_6 = hidden_states_36.reshape(1, 24, 2, 128)
+    hidden_states_36 = None
+    getitem_60 = value_states_7[:, :, None, :, :]
+    hidden_states_37 = getitem_60.expand(1, 8, 3, 2, 128)
+    getitem_60 = None
+    value_6 = hidden_states_37.reshape(1, 24, 2, 128)
+    hidden_states_37 = None
+    attention_mask_3 = l_causal_mask_[:, :, :, :2]
+    query_3 = q_embed_3.contiguous()
+    q_embed_3 = None
+    key_7 = key_6.contiguous()
+    key_6 = None
+    value_7 = value_6.contiguous()
+    value_6 = None
+    attn_output_12 = torch._C._nn.scaled_dot_product_attention(query_3, key_7, value_7, attn_mask=attention_mask_3, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_3 = key_7 = value_7 = attention_mask_3 = None
+    transpose_15 = attn_output_12.transpose(1, 2)
+    attn_output_12 = None
+    attn_output_13 = transpose_15.contiguous()
+    transpose_15 = None
+    reshape_11 = attn_output_13.reshape(1, 2, -1)
+    attn_output_13 = None
+    attn_output_14 = reshape_11.contiguous()
+    reshape_11 = None
+    attn_output_15 = torch._C._nn.linear(attn_output_14, l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_14 = l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = None
+    dropout_6 = torch.nn.functional.dropout(attn_output_15, 0.0, False, False)
+    attn_output_15 = None
+    hidden_states_38 = hidden_states_32 + dropout_6
+    hidden_states_32 = dropout_6 = None
+    hidden_states_39 = hidden_states_38.to(torch.float32)
+    pow_8 = hidden_states_39.pow(2)
+    variance_7 = pow_8.mean(-1, keepdim=True)
+    pow_8 = None
+    add_22 = variance_7 + 1e-05
+    variance_7 = None
+    rsqrt_7 = torch.rsqrt(add_22)
+    add_22 = None
+    hidden_states_40 = hidden_states_39 * rsqrt_7
+    hidden_states_39 = rsqrt_7 = None
+    to_15 = hidden_states_40.to(torch.bfloat16)
+    hidden_states_40 = None
+    hidden_states_41 = l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ * to_15
+    l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = to_15 = None
+    up_states_9 = torch._C._nn.linear(hidden_states_41, l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_41 = l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_3 = up_states_9.chunk(2, dim=-1)
+    up_states_9 = None
+    gate_3 = chunk_3[0]
+    up_states_10 = chunk_3[1]
+    chunk_3 = None
+    silu_3 = torch.nn.functional.silu(gate_3, inplace=False)
+    gate_3 = None
+    up_states_11 = up_states_10 * silu_3
+    up_states_10 = silu_3 = None
+    hidden_states_42 = torch._C._nn.linear(up_states_11, l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, None)
+    up_states_11 = l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = None
+    dropout_7 = torch.nn.functional.dropout(hidden_states_42, 0.0, False, False)
+    hidden_states_42 = None
+    hidden_states_43 = hidden_states_38 + dropout_7
+    hidden_states_38 = dropout_7 = None
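+    # NOTE: the chunk/silu/mul/linear sequence above is a gated (SwiGLU-style) MLP:
+    # gate_up_proj emits [gate | up] and the output is down_proj(up * silu(gate)).
+    # Equivalent sketch (illustrative names, not part of the captured graph):
+    #     gate, up = gate_up_proj(x).chunk(2, dim=-1)
+    #     y = down_proj(up * torch.nn.functional.silu(gate))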
+    hidden_states_44 = hidden_states_43.to(torch.float32)
+    pow_9 = hidden_states_44.pow(2)
+    variance_8 = pow_9.mean(-1, keepdim=True)
+    pow_9 = None
+    add_24 = variance_8 + 1e-05
+    variance_8 = None
+    rsqrt_8 = torch.rsqrt(add_24)
+    add_24 = None
+    hidden_states_45 = hidden_states_44 * rsqrt_8
+    hidden_states_44 = rsqrt_8 = None
+    to_17 = hidden_states_45.to(torch.bfloat16)
+    hidden_states_45 = None
+    hidden_states_46 = l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ * to_17
+    l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = to_17 = None
+    qkv_4 = torch._C._nn.linear(hidden_states_46, l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_, None)
+    hidden_states_46 = l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
+    query_states_8 = qkv_4[..., :3072]
+    key_states_8 = qkv_4[..., 3072:4096]
+    value_states_8 = qkv_4[..., 4096:]
+    qkv_4 = None
+    view_12 = query_states_8.view((1, 2, -1, 128))
+    query_states_8 = None
+    query_states_9 = view_12.transpose(1, 2)
+    view_12 = None
+    view_13 = key_states_8.view((1, 2, -1, 128))
+    key_states_8 = None
+    key_states_9 = view_13.transpose(1, 2)
+    view_13 = None
+    view_14 = value_states_8.view((1, 2, -1, 128))
+    value_states_8 = None
+    value_states_9 = view_14.transpose(1, 2)
+    view_14 = None
+    cos_4 = l_stack0_0_.unsqueeze(1)
+    sin_4 = l_stack0_1_.unsqueeze(1)
+    q_rot_4 = query_states_9[..., :96]
+    q_pass_4 = query_states_9[..., 96:]
+    query_states_9 = None
+    k_rot_4 = key_states_9[..., :96]
+    k_pass_4 = key_states_9[..., 96:]
+    key_states_9 = None
+    mul_38 = q_rot_4 * cos_4
+    x1_8 = q_rot_4[..., :48]
+    x2_8 = q_rot_4[..., 48:]
+    q_rot_4 = None
+    neg_8 = -x2_8
+    x2_8 = None
+    cat_16 = torch.cat((neg_8, x1_8), dim=-1)
+    neg_8 = x1_8 = None
+    mul_39 = cat_16 * sin_4
+    cat_16 = None
+    add_25 = mul_38 + mul_39
+    mul_38 = mul_39 = None
+    q_embed_4 = torch.cat([add_25, q_pass_4], dim=-1)
+    add_25 = q_pass_4 = None
+    mul_40 = k_rot_4 * cos_4
+    cos_4 = None
+    x1_9 = k_rot_4[..., :48]
+    x2_9 = k_rot_4[..., 48:]
+    k_rot_4 = None
+    neg_9 = -x2_9
+    x2_9 = None
+    cat_18 = torch.cat((neg_9, x1_9), dim=-1)
+    neg_9 = x1_9 = None
+    mul_41 = cat_18 * sin_4
+    cat_18 = sin_4 = None
+    add_26 = mul_40 + mul_41
+    mul_40 = mul_41 = None
+    k_embed_4 = torch.cat([add_26, k_pass_4], dim=-1)
+    add_26 = k_pass_4 = None
+    getitem_75 = k_embed_4[:, :, None, :, :]
+    hidden_states_47 = getitem_75.expand(1, 8, 3, 2, 128)
+    getitem_75 = None
+    key_8 = hidden_states_47.reshape(1, 24, 2, 128)
+    hidden_states_47 = None
+    getitem_76 = value_states_9[:, :, None, :, :]
+    hidden_states_48 = getitem_76.expand(1, 8, 3, 2, 128)
+    getitem_76 = None
+    value_8 = hidden_states_48.reshape(1, 24, 2, 128)
+    hidden_states_48 = None
+    attention_mask_4 = l_causal_mask_[:, :, :, :2]
+    query_4 = q_embed_4.contiguous()
+    q_embed_4 = None
+    key_9 = key_8.contiguous()
+    key_8 = None
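+    # NOTE: the unsqueeze/expand/reshape pairs above replicate each of the 8 key/value
+    # heads 3x to match the 24 query heads, i.e. the traced form of a repeat_kv-style
+    # expansion for grouped-query attention.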
+    value_9 = value_8.contiguous()
+    value_8 = None
+    attn_output_16 = torch._C._nn.scaled_dot_product_attention(query_4, key_9, value_9, attn_mask=attention_mask_4, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_4 = key_9 = value_9 = attention_mask_4 = None
+    transpose_19 = attn_output_16.transpose(1, 2)
+    attn_output_16 = None
+    attn_output_17 = transpose_19.contiguous()
+    transpose_19 = None
+    reshape_14 = attn_output_17.reshape(1, 2, -1)
+    attn_output_17 = None
+    attn_output_18 = reshape_14.contiguous()
+    reshape_14 = None
+    attn_output_19 = torch._C._nn.linear(attn_output_18, l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_18 = l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = None
+    dropout_8 = torch.nn.functional.dropout(attn_output_19, 0.0, False, False)
+    attn_output_19 = None
+    hidden_states_49 = hidden_states_43 + dropout_8
+    hidden_states_43 = dropout_8 = None
+    hidden_states_50 = hidden_states_49.to(torch.float32)
+    pow_10 = hidden_states_50.pow(2)
+    variance_9 = pow_10.mean(-1, keepdim=True)
+    pow_10 = None
+    add_28 = variance_9 + 1e-05
+    variance_9 = None
+    rsqrt_9 = torch.rsqrt(add_28)
+    add_28 = None
+    hidden_states_51 = hidden_states_50 * rsqrt_9
+    hidden_states_50 = rsqrt_9 = None
+    to_19 = hidden_states_51.to(torch.bfloat16)
+    hidden_states_51 = None
+    hidden_states_52 = l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ * to_19
+    l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = to_19 = None
+    up_states_12 = torch._C._nn.linear(hidden_states_52, l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_52 = l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_4 = up_states_12.chunk(2, dim=-1)
+    up_states_12 = None
+    gate_4 = chunk_4[0]
+    up_states_13 = chunk_4[1]
+    chunk_4 = None
+    silu_4 = torch.nn.functional.silu(gate_4, inplace=False)
+    gate_4 = None
+    up_states_14 = up_states_13 * silu_4
+    up_states_13 = silu_4 = None
+    hidden_states_53 = torch._C._nn.linear(up_states_14, l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, None)
+    up_states_14 = l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = None
+    dropout_9 = torch.nn.functional.dropout(hidden_states_53, 0.0, False, False)
+    hidden_states_53 = None
+    hidden_states_54 = hidden_states_49 + dropout_9
+    hidden_states_49 = dropout_9 = None
+    hidden_states_55 = hidden_states_54.to(torch.float32)
+    pow_11 = hidden_states_55.pow(2)
+    variance_10 = pow_11.mean(-1, keepdim=True)
+    pow_11 = None
+    add_30 = variance_10 + 1e-05
+    variance_10 = None
+    rsqrt_10 = torch.rsqrt(add_30)
+    add_30 = None
+    hidden_states_56 = hidden_states_55 * rsqrt_10
+    hidden_states_55 = rsqrt_10 = None
+    to_21 = hidden_states_56.to(torch.bfloat16)
+    hidden_states_56 = None
+    hidden_states_57 = l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ * to_21
+    l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = to_21 = None
+    qkv_5 = torch._C._nn.linear(hidden_states_57, l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_, None)
+    hidden_states_57 = l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
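+    # NOTE: each fused qkv_proj output is split as [..., :3072] (query),
+    # [..., 3072:4096] (key) and [..., 4096:] (value): 24 query heads and 8 key/value
+    # heads of dim 128, viewed as (batch, seq, heads, 128) and transposed to
+    # (batch, heads, seq, 128).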
+    query_states_10 = qkv_5[..., :3072]
+    key_states_10 = qkv_5[..., 3072:4096]
+    value_states_10 = qkv_5[..., 4096:]
+    qkv_5 = None
+    view_15 = query_states_10.view((1, 2, -1, 128))
+    query_states_10 = None
+    query_states_11 = view_15.transpose(1, 2)
+    view_15 = None
+    view_16 = key_states_10.view((1, 2, -1, 128))
+    key_states_10 = None
+    key_states_11 = view_16.transpose(1, 2)
+    view_16 = None
+    view_17 = value_states_10.view((1, 2, -1, 128))
+    value_states_10 = None
+    value_states_11 = view_17.transpose(1, 2)
+    view_17 = None
+    cos_5 = l_stack0_0_.unsqueeze(1)
+    sin_5 = l_stack0_1_.unsqueeze(1)
+    q_rot_5 = query_states_11[..., :96]
+    q_pass_5 = query_states_11[..., 96:]
+    query_states_11 = None
+    k_rot_5 = key_states_11[..., :96]
+    k_pass_5 = key_states_11[..., 96:]
+    key_states_11 = None
+    mul_47 = q_rot_5 * cos_5
+    x1_10 = q_rot_5[..., :48]
+    x2_10 = q_rot_5[..., 48:]
+    q_rot_5 = None
+    neg_10 = -x2_10
+    x2_10 = None
+    cat_20 = torch.cat((neg_10, x1_10), dim=-1)
+    neg_10 = x1_10 = None
+    mul_48 = cat_20 * sin_5
+    cat_20 = None
+    add_31 = mul_47 + mul_48
+    mul_47 = mul_48 = None
+    q_embed_5 = torch.cat([add_31, q_pass_5], dim=-1)
+    add_31 = q_pass_5 = None
+    mul_49 = k_rot_5 * cos_5
+    cos_5 = None
+    x1_11 = k_rot_5[..., :48]
+    x2_11 = k_rot_5[..., 48:]
+    k_rot_5 = None
+    neg_11 = -x2_11
+    x2_11 = None
+    cat_22 = torch.cat((neg_11, x1_11), dim=-1)
+    neg_11 = x1_11 = None
+    mul_50 = cat_22 * sin_5
+    cat_22 = sin_5 = None
+    add_32 = mul_49 + mul_50
+    mul_49 = mul_50 = None
+    k_embed_5 = torch.cat([add_32, k_pass_5], dim=-1)
+    add_32 = k_pass_5 = None
+    getitem_91 = k_embed_5[:, :, None, :, :]
+    hidden_states_58 = getitem_91.expand(1, 8, 3, 2, 128)
+    getitem_91 = None
+    key_10 = hidden_states_58.reshape(1, 24, 2, 128)
+    hidden_states_58 = None
+    getitem_92 = value_states_11[:, :, None, :, :]
+    hidden_states_59 = getitem_92.expand(1, 8, 3, 2, 128)
+    getitem_92 = None
+    value_10 = hidden_states_59.reshape(1, 24, 2, 128)
+    hidden_states_59 = None
+    attention_mask_5 = l_causal_mask_[:, :, :, :2]
+    query_5 = q_embed_5.contiguous()
+    q_embed_5 = None
+    key_11 = key_10.contiguous()
+    key_10 = None
+    value_11 = value_10.contiguous()
+    value_10 = None
+    attn_output_20 = torch._C._nn.scaled_dot_product_attention(query_5, key_11, value_11, attn_mask=attention_mask_5, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_5 = key_11 = value_11 = attention_mask_5 = None
+    transpose_23 = attn_output_20.transpose(1, 2)
+    attn_output_20 = None
+    attn_output_21 = transpose_23.contiguous()
+    transpose_23 = None
+    reshape_17 = attn_output_21.reshape(1, 2, -1)
+    attn_output_21 = None
+    attn_output_22 = reshape_17.contiguous()
+    reshape_17 = None
+    attn_output_23 = torch._C._nn.linear(attn_output_22, l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_22 = l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = None
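+    # NOTE: scale=0.08838834764831845 is 1/sqrt(128), the standard 1/sqrt(head_dim)
+    # attention scaling; is_causal=False because causality is supplied explicitly
+    # through attn_mask, sliced from l_causal_mask_ to the traced sequence length of 2.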
+    dropout_10 = torch.nn.functional.dropout(attn_output_23, 0.0, False, False)
+    attn_output_23 = None
+    hidden_states_60 = hidden_states_54 + dropout_10
+    hidden_states_54 = dropout_10 = None
+    hidden_states_61 = hidden_states_60.to(torch.float32)
+    pow_12 = hidden_states_61.pow(2)
+    variance_11 = pow_12.mean(-1, keepdim=True)
+    pow_12 = None
+    add_34 = variance_11 + 1e-05
+    variance_11 = None
+    rsqrt_11 = torch.rsqrt(add_34)
+    add_34 = None
+    hidden_states_62 = hidden_states_61 * rsqrt_11
+    hidden_states_61 = rsqrt_11 = None
+    to_23 = hidden_states_62.to(torch.bfloat16)
+    hidden_states_62 = None
+    hidden_states_63 = l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ * to_23
+    l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = to_23 = None
+    up_states_15 = torch._C._nn.linear(hidden_states_63, l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_63 = l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_5 = up_states_15.chunk(2, dim=-1)
+    up_states_15 = None
+    gate_5 = chunk_5[0]
+    up_states_16 = chunk_5[1]
+    chunk_5 = None
+    silu_5 = torch.nn.functional.silu(gate_5, inplace=False)
+    gate_5 = None
+    up_states_17 = up_states_16 * silu_5
+    up_states_16 = silu_5 = None
+    hidden_states_64 = torch._C._nn.linear(up_states_17, l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, None)
+    up_states_17 = l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = None
+    dropout_11 = torch.nn.functional.dropout(hidden_states_64, 0.0, False, False)
+    hidden_states_64 = None
+    hidden_states_65 = hidden_states_60 + dropout_11
+    hidden_states_60 = dropout_11 = None
+    hidden_states_66 = hidden_states_65.to(torch.float32)
+    pow_13 = hidden_states_66.pow(2)
+    variance_12 = pow_13.mean(-1, keepdim=True)
+    pow_13 = None
+    add_36 = variance_12 + 1e-05
+    variance_12 = None
+    rsqrt_12 = torch.rsqrt(add_36)
+    add_36 = None
+    hidden_states_67 = hidden_states_66 * rsqrt_12
+    hidden_states_66 = rsqrt_12 = None
+    to_25 = hidden_states_67.to(torch.bfloat16)
+    hidden_states_67 = None
+    hidden_states_68 = l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ * to_25
+    l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = to_25 = None
+    qkv_6 = torch._C._nn.linear(hidden_states_68, l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_, None)
+    hidden_states_68 = l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
+    query_states_12 = qkv_6[..., :3072]
+    key_states_12 = qkv_6[..., 3072:4096]
+    value_states_12 = qkv_6[..., 4096:]
+    qkv_6 = None
+    view_18 = query_states_12.view((1, 2, -1, 128))
+    query_states_12 = None
+    query_states_13 = view_18.transpose(1, 2)
+    view_18 = None
+    view_19 = key_states_12.view((1, 2, -1, 128))
+    key_states_12 = None
+    key_states_13 = view_19.transpose(1, 2)
+    view_19 = None
+    view_20 = value_states_12.view((1, 2, -1, 128))
+    value_states_12 = None
+    value_states_13 = view_20.transpose(1, 2)
+    view_20 = None
+    cos_6 = l_stack0_0_.unsqueeze(1)
+    sin_6 = l_stack0_1_.unsqueeze(1)
+    q_rot_6 = query_states_13[..., :96]
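+    # NOTE: the recurring to(float32)/pow(2)/mean/rsqrt/mul/to(bfloat16) pattern above
+    # is RMSNorm computed in fp32 and cast back to bf16:
+    #     y = weight * (x / sqrt(mean(x**2, dim=-1, keepdim=True) + 1e-05))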
+    q_pass_6 = query_states_13[..., 96:]
+    query_states_13 = None
+    k_rot_6 = key_states_13[..., :96]
+    k_pass_6 = key_states_13[..., 96:]
+    key_states_13 = None
+    mul_56 = q_rot_6 * cos_6
+    x1_12 = q_rot_6[..., :48]
+    x2_12 = q_rot_6[..., 48:]
+    q_rot_6 = None
+    neg_12 = -x2_12
+    x2_12 = None
+    cat_24 = torch.cat((neg_12, x1_12), dim=-1)
+    neg_12 = x1_12 = None
+    mul_57 = cat_24 * sin_6
+    cat_24 = None
+    add_37 = mul_56 + mul_57
+    mul_56 = mul_57 = None
+    q_embed_6 = torch.cat([add_37, q_pass_6], dim=-1)
+    add_37 = q_pass_6 = None
+    mul_58 = k_rot_6 * cos_6
+    cos_6 = None
+    x1_13 = k_rot_6[..., :48]
+    x2_13 = k_rot_6[..., 48:]
+    k_rot_6 = None
+    neg_13 = -x2_13
+    x2_13 = None
+    cat_26 = torch.cat((neg_13, x1_13), dim=-1)
+    neg_13 = x1_13 = None
+    mul_59 = cat_26 * sin_6
+    cat_26 = sin_6 = None
+    add_38 = mul_58 + mul_59
+    mul_58 = mul_59 = None
+    k_embed_6 = torch.cat([add_38, k_pass_6], dim=-1)
+    add_38 = k_pass_6 = None
+    getitem_107 = k_embed_6[:, :, None, :, :]
+    hidden_states_69 = getitem_107.expand(1, 8, 3, 2, 128)
+    getitem_107 = None
+    key_12 = hidden_states_69.reshape(1, 24, 2, 128)
+    hidden_states_69 = None
+    getitem_108 = value_states_13[:, :, None, :, :]
+    hidden_states_70 = getitem_108.expand(1, 8, 3, 2, 128)
+    getitem_108 = None
+    value_12 = hidden_states_70.reshape(1, 24, 2, 128)
+    hidden_states_70 = None
+    attention_mask_6 = l_causal_mask_[:, :, :, :2]
+    query_6 = q_embed_6.contiguous()
+    q_embed_6 = None
+    key_13 = key_12.contiguous()
+    key_12 = None
+    value_13 = value_12.contiguous()
+    value_12 = None
+    attn_output_24 = torch._C._nn.scaled_dot_product_attention(query_6, key_13, value_13, attn_mask=attention_mask_6, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_6 = key_13 = value_13 = attention_mask_6 = None
+    transpose_27 = attn_output_24.transpose(1, 2)
+    attn_output_24 = None
+    attn_output_25 = transpose_27.contiguous()
+    transpose_27 = None
+    reshape_20 = attn_output_25.reshape(1, 2, -1)
+    attn_output_25 = None
+    attn_output_26 = reshape_20.contiguous()
+    reshape_20 = None
+    attn_output_27 = torch._C._nn.linear(attn_output_26, l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_26 = l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = None
+    dropout_12 = torch.nn.functional.dropout(attn_output_27, 0.0, False, False)
+    attn_output_27 = None
+    hidden_states_71 = hidden_states_65 + dropout_12
+    hidden_states_65 = dropout_12 = None
+    hidden_states_72 = hidden_states_71.to(torch.float32)
+    pow_14 = hidden_states_72.pow(2)
+    variance_13 = pow_14.mean(-1, keepdim=True)
+    pow_14 = None
+    add_40 = variance_13 + 1e-05
+    variance_13 = None
+    rsqrt_13 = torch.rsqrt(add_40)
+    add_40 = None
+    hidden_states_73 = hidden_states_72 * rsqrt_13
+    hidden_states_72 = rsqrt_13 = None
+    to_27 = hidden_states_73.to(torch.bfloat16)
+    hidden_states_73 = None
+    hidden_states_74 = l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ * to_27
+    l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = to_27 = None
+    up_states_18 = torch._C._nn.linear(hidden_states_74, l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_74 = l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_6 = up_states_18.chunk(2, dim=-1)
+    up_states_18 = None
+    gate_6 = chunk_6[0]
+    up_states_19 = chunk_6[1]
+    chunk_6 = None
+    silu_6 = torch.nn.functional.silu(gate_6, inplace=False)
+    gate_6 = None
+    up_states_20 = up_states_19 * silu_6
+    up_states_19 = silu_6 = None
+    hidden_states_75 = torch._C._nn.linear(up_states_20, l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, None)
+    up_states_20 = l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = None
+    dropout_13 = torch.nn.functional.dropout(hidden_states_75, 0.0, False, False)
+    hidden_states_75 = None
+    hidden_states_76 = hidden_states_71 + dropout_13
+    hidden_states_71 = dropout_13 = None
+    hidden_states_77 = hidden_states_76.to(torch.float32)
+    pow_15 = hidden_states_77.pow(2)
+    variance_14 = pow_15.mean(-1, keepdim=True)
+    pow_15 = None
+    add_42 = variance_14 + 1e-05
+    variance_14 = None
+    rsqrt_14 = torch.rsqrt(add_42)
+    add_42 = None
+    hidden_states_78 = hidden_states_77 * rsqrt_14
+    hidden_states_77 = rsqrt_14 = None
+    to_29 = hidden_states_78.to(torch.bfloat16)
+    hidden_states_78 = None
+    hidden_states_79 = l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ * to_29
+    l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = to_29 = None
+    qkv_7 = torch._C._nn.linear(hidden_states_79, l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_, None)
+    hidden_states_79 = l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
+    query_states_14 = qkv_7[..., :3072]
+    key_states_14 = qkv_7[..., 3072:4096]
+    value_states_14 = qkv_7[..., 4096:]
+    qkv_7 = None
+    view_21 = query_states_14.view((1, 2, -1, 128))
+    query_states_14 = None
+    query_states_15 = view_21.transpose(1, 2)
+    view_21 = None
+    view_22 = key_states_14.view((1, 2, -1, 128))
+    key_states_14 = None
+    key_states_15 = view_22.transpose(1, 2)
+    view_22 = None
+    view_23 = value_states_14.view((1, 2, -1, 128))
+    value_states_14 = None
+    value_states_15 = view_23.transpose(1, 2)
+    view_23 = None
+    cos_7 = l_stack0_0_.unsqueeze(1)
+    sin_7 = l_stack0_1_.unsqueeze(1)
+    q_rot_7 = query_states_15[..., :96]
+    q_pass_7 = query_states_15[..., 96:]
+    query_states_15 = None
+    k_rot_7 = key_states_15[..., :96]
+    k_pass_7 = key_states_15[..., 96:]
+    key_states_15 = None
+    mul_65 = q_rot_7 * cos_7
+    x1_14 = q_rot_7[..., :48]
+    x2_14 = q_rot_7[..., 48:]
+    q_rot_7 = None
+    neg_14 = -x2_14
+    x2_14 = None
+    cat_28 = torch.cat((neg_14, x1_14), dim=-1)
+    neg_14 = x1_14 = None
+    mul_66 = cat_28 * sin_7
+    cat_28 = None
+    add_43 = mul_65 + mul_66
+    mul_65 = mul_66 = None
+    q_embed_7 = torch.cat([add_43, q_pass_7], dim=-1)
+    add_43 = q_pass_7 = None
+    mul_67 = k_rot_7 * cos_7
+    cos_7 = None
+    x1_15 = k_rot_7[..., :48]
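+    # NOTE: every decoder layer repeats the identical traced block (input RMSNorm ->
+    # fused QKV -> partial RoPE -> KV repeat -> SDPA -> o_proj -> residual -> RMSNorm ->
+    # gated MLP -> residual); only the SSA indices advance from layer to layer.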
+    x2_15 = k_rot_7[..., 48:]
+    k_rot_7 = None
+    neg_15 = -x2_15
+    x2_15 = None
+    cat_30 = torch.cat((neg_15, x1_15), dim=-1)
+    neg_15 = x1_15 = None
+    mul_68 = cat_30 * sin_7
+    cat_30 = sin_7 = None
+    add_44 = mul_67 + mul_68
+    mul_67 = mul_68 = None
+    k_embed_7 = torch.cat([add_44, k_pass_7], dim=-1)
+    add_44 = k_pass_7 = None
+    getitem_123 = k_embed_7[:, :, None, :, :]
+    hidden_states_80 = getitem_123.expand(1, 8, 3, 2, 128)
+    getitem_123 = None
+    key_14 = hidden_states_80.reshape(1, 24, 2, 128)
+    hidden_states_80 = None
+    getitem_124 = value_states_15[:, :, None, :, :]
+    hidden_states_81 = getitem_124.expand(1, 8, 3, 2, 128)
+    getitem_124 = None
+    value_14 = hidden_states_81.reshape(1, 24, 2, 128)
+    hidden_states_81 = None
+    attention_mask_7 = l_causal_mask_[:, :, :, :2]
+    query_7 = q_embed_7.contiguous()
+    q_embed_7 = None
+    key_15 = key_14.contiguous()
+    key_14 = None
+    value_15 = value_14.contiguous()
+    value_14 = None
+    attn_output_28 = torch._C._nn.scaled_dot_product_attention(query_7, key_15, value_15, attn_mask=attention_mask_7, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_7 = key_15 = value_15 = attention_mask_7 = None
+    transpose_31 = attn_output_28.transpose(1, 2)
+    attn_output_28 = None
+    attn_output_29 = transpose_31.contiguous()
+    transpose_31 = None
+    reshape_23 = attn_output_29.reshape(1, 2, -1)
+    attn_output_29 = None
+    attn_output_30 = reshape_23.contiguous()
+    reshape_23 = None
+    attn_output_31 = torch._C._nn.linear(attn_output_30, l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_30 = l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = None
+    dropout_14 = torch.nn.functional.dropout(attn_output_31, 0.0, False, False)
+    attn_output_31 = None
+    hidden_states_82 = hidden_states_76 + dropout_14
+    hidden_states_76 = dropout_14 = None
+    hidden_states_83 = hidden_states_82.to(torch.float32)
+    pow_16 = hidden_states_83.pow(2)
+    variance_15 = pow_16.mean(-1, keepdim=True)
+    pow_16 = None
+    add_46 = variance_15 + 1e-05
+    variance_15 = None
+    rsqrt_15 = torch.rsqrt(add_46)
+    add_46 = None
+    hidden_states_84 = hidden_states_83 * rsqrt_15
+    hidden_states_83 = rsqrt_15 = None
+    to_31 = hidden_states_84.to(torch.bfloat16)
+    hidden_states_84 = None
+    hidden_states_85 = l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ * to_31
+    l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = to_31 = None
+    up_states_21 = torch._C._nn.linear(hidden_states_85, l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_85 = l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_7 = up_states_21.chunk(2, dim=-1)
+    up_states_21 = None
+    gate_7 = chunk_7[0]
+    up_states_22 = chunk_7[1]
+    chunk_7 = None
+    silu_7 = torch.nn.functional.silu(gate_7, inplace=False)
+    gate_7 = None
+    up_states_23 = up_states_22 * silu_7
+    up_states_22 = silu_7 = None
+    hidden_states_86 = torch._C._nn.linear(up_states_23, l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, None)
+    up_states_23 = l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = None
+    dropout_15 = torch.nn.functional.dropout(hidden_states_86, 0.0, False, False)
+    hidden_states_86 = None
+    hidden_states_87 = hidden_states_82 + dropout_15
+    hidden_states_82 = dropout_15 = None
+    hidden_states_88 = hidden_states_87.to(torch.float32)
+    pow_17 = hidden_states_88.pow(2)
+    variance_16 = pow_17.mean(-1, keepdim=True)
+    pow_17 = None
+    add_48 = variance_16 + 1e-05
+    variance_16 = None
+    rsqrt_16 = torch.rsqrt(add_48)
+    add_48 = None
+    hidden_states_89 = hidden_states_88 * rsqrt_16
+    hidden_states_88 = rsqrt_16 = None
+    to_33 = hidden_states_89.to(torch.bfloat16)
+    hidden_states_89 = None
+    hidden_states_90 = l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ * to_33
+    l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = to_33 = None
+    qkv_8 = torch._C._nn.linear(hidden_states_90, l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_, None)
+    hidden_states_90 = l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
+    query_states_16 = qkv_8[..., :3072]
+    key_states_16 = qkv_8[..., 3072:4096]
+    value_states_16 = qkv_8[..., 4096:]
+    qkv_8 = None
+    view_24 = query_states_16.view((1, 2, -1, 128))
+    query_states_16 = None
+    query_states_17 = view_24.transpose(1, 2)
+    view_24 = None
+    view_25 = key_states_16.view((1, 2, -1, 128))
+    key_states_16 = None
+    key_states_17 = view_25.transpose(1, 2)
+    view_25 = None
+    view_26 = value_states_16.view((1, 2, -1, 128))
+    value_states_16 = None
+    value_states_17 = view_26.transpose(1, 2)
+    view_26 = None
+    cos_8 = l_stack0_0_.unsqueeze(1)
+    sin_8 = l_stack0_1_.unsqueeze(1)
+    q_rot_8 = query_states_17[..., :96]
+    q_pass_8 = query_states_17[..., 96:]
+    query_states_17 = None
+    k_rot_8 = key_states_17[..., :96]
+    k_pass_8 = key_states_17[..., 96:]
+    key_states_17 = None
+    mul_74 = q_rot_8 * cos_8
+    x1_16 = q_rot_8[..., :48]
+    x2_16 = q_rot_8[..., 48:]
+    q_rot_8 = None
+    neg_16 = -x2_16
+    x2_16 = None
+    cat_32 = torch.cat((neg_16, x1_16), dim=-1)
+    neg_16 = x1_16 = None
+    mul_75 = cat_32 * sin_8
+    cat_32 = None
+    add_49 = mul_74 + mul_75
+    mul_74 = mul_75 = None
+    q_embed_8 = torch.cat([add_49, q_pass_8], dim=-1)
+    add_49 = q_pass_8 = None
+    mul_76 = k_rot_8 * cos_8
+    cos_8 = None
+    x1_17 = k_rot_8[..., :48]
+    x2_17 = k_rot_8[..., 48:]
+    k_rot_8 = None
+    neg_17 = -x2_17
+    x2_17 = None
+    cat_34 = torch.cat((neg_17, x1_17), dim=-1)
+    neg_17 = x1_17 = None
+    mul_77 = cat_34 * sin_8
+    cat_34 = sin_8 = None
+    add_50 = mul_76 + mul_77
+    mul_76 = mul_77 = None
+    k_embed_8 = torch.cat([add_50, k_pass_8], dim=-1)
+    add_50 = k_pass_8 = None
+    getitem_139 = k_embed_8[:, :, None, :, :]
+    hidden_states_91 = getitem_139.expand(1, 8, 3, 2, 128)
+    getitem_139 = None
+    key_16 = hidden_states_91.reshape(1, 24, 2, 128)
+    hidden_states_91 = None
+    getitem_140 = value_states_17[:, :, None, :, :]
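+    # NOTE: cos_N/sin_N are re-derived in every layer from the same shared rotary
+    # cache (l_stack0_0_, l_stack0_1_) computed earlier in the graph; only the
+    # unsqueeze(1) is repeated per layer.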
+    hidden_states_92 = getitem_140.expand(1, 8, 3, 2, 128)
+    getitem_140 = None
+    value_16 = hidden_states_92.reshape(1, 24, 2, 128)
+    hidden_states_92 = None
+    attention_mask_8 = l_causal_mask_[:, :, :, :2]
+    query_8 = q_embed_8.contiguous()
+    q_embed_8 = None
+    key_17 = key_16.contiguous()
+    key_16 = None
+    value_17 = value_16.contiguous()
+    value_16 = None
+    attn_output_32 = torch._C._nn.scaled_dot_product_attention(query_8, key_17, value_17, attn_mask=attention_mask_8, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_8 = key_17 = value_17 = attention_mask_8 = None
+    transpose_35 = attn_output_32.transpose(1, 2)
+    attn_output_32 = None
+    attn_output_33 = transpose_35.contiguous()
+    transpose_35 = None
+    reshape_26 = attn_output_33.reshape(1, 2, -1)
+    attn_output_33 = None
+    attn_output_34 = reshape_26.contiguous()
+    reshape_26 = None
+    attn_output_35 = torch._C._nn.linear(attn_output_34, l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_34 = l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = None
+    dropout_16 = torch.nn.functional.dropout(attn_output_35, 0.0, False, False)
+    attn_output_35 = None
+    hidden_states_93 = hidden_states_87 + dropout_16
+    hidden_states_87 = dropout_16 = None
+    hidden_states_94 = hidden_states_93.to(torch.float32)
+    pow_18 = hidden_states_94.pow(2)
+    variance_17 = pow_18.mean(-1, keepdim=True)
+    pow_18 = None
+    add_52 = variance_17 + 1e-05
+    variance_17 = None
+    rsqrt_17 = torch.rsqrt(add_52)
+    add_52 = None
+    hidden_states_95 = hidden_states_94 * rsqrt_17
+    hidden_states_94 = rsqrt_17 = None
+    to_35 = hidden_states_95.to(torch.bfloat16)
+    hidden_states_95 = None
+    hidden_states_96 = l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ * to_35
+    l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = to_35 = None
+    up_states_24 = torch._C._nn.linear(hidden_states_96, l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_96 = l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_8 = up_states_24.chunk(2, dim=-1)
+    up_states_24 = None
+    gate_8 = chunk_8[0]
+    up_states_25 = chunk_8[1]
+    chunk_8 = None
+    silu_8 = torch.nn.functional.silu(gate_8, inplace=False)
+    gate_8 = None
+    up_states_26 = up_states_25 * silu_8
+    up_states_25 = silu_8 = None
+    hidden_states_97 = torch._C._nn.linear(up_states_26, l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, None)
+    up_states_26 = l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = None
+    dropout_17 = torch.nn.functional.dropout(hidden_states_97, 0.0, False, False)
+    hidden_states_97 = None
+    hidden_states_98 = hidden_states_93 + dropout_17
+    hidden_states_93 = dropout_17 = None
+    hidden_states_99 = hidden_states_98.to(torch.float32)
+    pow_19 = hidden_states_99.pow(2)
+    variance_18 = pow_19.mean(-1, keepdim=True)
+    pow_19 = None
+    add_54 = variance_18 + 1e-05
+    variance_18 = None
+    rsqrt_18 = torch.rsqrt(add_54)
+    add_54 = None
+    hidden_states_100 = hidden_states_99 * rsqrt_18
+    hidden_states_99 = rsqrt_18 = None
+    to_37 = hidden_states_100.to(torch.bfloat16)
+    hidden_states_100 = None
+    hidden_states_101 = l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ * to_37
+    l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = to_37 = None
+    qkv_9 = torch._C._nn.linear(hidden_states_101, l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_, None)
+    hidden_states_101 = l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
+    query_states_18 = qkv_9[..., :3072]
+    key_states_18 = qkv_9[..., 3072:4096]
+    value_states_18 = qkv_9[..., 4096:]
+    qkv_9 = None
+    view_27 = query_states_18.view((1, 2, -1, 128))
+    query_states_18 = None
+    query_states_19 = view_27.transpose(1, 2)
+    view_27 = None
+    view_28 = key_states_18.view((1, 2, -1, 128))
+    key_states_18 = None
+    key_states_19 = view_28.transpose(1, 2)
+    view_28 = None
+    view_29 = value_states_18.view((1, 2, -1, 128))
+    value_states_18 = None
+    value_states_19 = view_29.transpose(1, 2)
+    view_29 = None
+    cos_9 = l_stack0_0_.unsqueeze(1)
+    sin_9 = l_stack0_1_.unsqueeze(1)
+    q_rot_9 = query_states_19[..., :96]
+    q_pass_9 = query_states_19[..., 96:]
+    query_states_19 = None
+    k_rot_9 = key_states_19[..., :96]
+    k_pass_9 = key_states_19[..., 96:]
+    key_states_19 = None
+    mul_83 = q_rot_9 * cos_9
+    x1_18 = q_rot_9[..., :48]
+    x2_18 = q_rot_9[..., 48:]
+    q_rot_9 = None
+    neg_18 = -x2_18
+    x2_18 = None
+    cat_36 = torch.cat((neg_18, x1_18), dim=-1)
+    neg_18 = x1_18 = None
+    mul_84 = cat_36 * sin_9
+    cat_36 = None
+    add_55 = mul_83 + mul_84
+    mul_83 = mul_84 = None
+    q_embed_9 = torch.cat([add_55, q_pass_9], dim=-1)
+    add_55 = q_pass_9 = None
+    mul_85 = k_rot_9 * cos_9
+    cos_9 = None
+    x1_19 = k_rot_9[..., :48]
+    x2_19 = k_rot_9[..., 48:]
+    k_rot_9 = None
+    neg_19 = -x2_19
+    x2_19 = None
+    cat_38 = torch.cat((neg_19, x1_19), dim=-1)
+    neg_19 = x1_19 = None
+    mul_86 = cat_38 * sin_9
+    cat_38 = sin_9 = None
+    add_56 = mul_85 + mul_86
+    mul_85 = mul_86 = None
+    k_embed_9 = torch.cat([add_56, k_pass_9], dim=-1)
+    add_56 = k_pass_9 = None
+    getitem_155 = k_embed_9[:, :, None, :, :]
+    hidden_states_102 = getitem_155.expand(1, 8, 3, 2, 128)
+    getitem_155 = None
+    key_18 = hidden_states_102.reshape(1, 24, 2, 128)
+    hidden_states_102 = None
+    getitem_156 = value_states_19[:, :, None, :, :]
+    hidden_states_103 = getitem_156.expand(1, 8, 3, 2, 128)
+    getitem_156 = None
+    value_18 = hidden_states_103.reshape(1, 24, 2, 128)
+    hidden_states_103 = None
+    attention_mask_9 = l_causal_mask_[:, :, :, :2]
+    query_9 = q_embed_9.contiguous()
+    q_embed_9 = None
+    key_19 = key_18.contiguous()
+    key_18 = None
+    value_19 = value_18.contiguous()
+    value_18 = None
+    attn_output_36 = torch._C._nn.scaled_dot_product_attention(query_9, key_19, value_19, attn_mask=attention_mask_9, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_9 = key_19 = value_19 = attention_mask_9 = None
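+    # NOTE: the pervasive `x = None` assignments are emitted by the tracer to drop
+    # references to intermediates (and already-consumed weights) as early as possible,
+    # keeping the generated forward's peak memory close to the eager module's.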
+    transpose_39 = attn_output_36.transpose(1, 2)
+    attn_output_36 = None
+    attn_output_37 = transpose_39.contiguous()
+    transpose_39 = None
+    reshape_29 = attn_output_37.reshape(1, 2, -1)
+    attn_output_37 = None
+    attn_output_38 = reshape_29.contiguous()
+    reshape_29 = None
+    attn_output_39 = torch._C._nn.linear(attn_output_38, l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_38 = l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = None
+    dropout_18 = torch.nn.functional.dropout(attn_output_39, 0.0, False, False)
+    attn_output_39 = None
+    hidden_states_104 = hidden_states_98 + dropout_18
+    hidden_states_98 = dropout_18 = None
+    hidden_states_105 = hidden_states_104.to(torch.float32)
+    pow_20 = hidden_states_105.pow(2)
+    variance_19 = pow_20.mean(-1, keepdim=True)
+    pow_20 = None
+    add_58 = variance_19 + 1e-05
+    variance_19 = None
+    rsqrt_19 = torch.rsqrt(add_58)
+    add_58 = None
+    hidden_states_106 = hidden_states_105 * rsqrt_19
+    hidden_states_105 = rsqrt_19 = None
+    to_39 = hidden_states_106.to(torch.bfloat16)
+    hidden_states_106 = None
+    hidden_states_107 = l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ * to_39
+    l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = to_39 = None
+    up_states_27 = torch._C._nn.linear(hidden_states_107, l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_107 = l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_9 = up_states_27.chunk(2, dim=-1)
+    up_states_27 = None
+    gate_9 = chunk_9[0]
+    up_states_28 = chunk_9[1]
+    chunk_9 = None
+    silu_9 = torch.nn.functional.silu(gate_9, inplace=False)
+    gate_9 = None
+    up_states_29 = up_states_28 * silu_9
+    up_states_28 = silu_9 = None
+    hidden_states_108 = torch._C._nn.linear(up_states_29, l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, None)
+    up_states_29 = l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = None
+    dropout_19 = torch.nn.functional.dropout(hidden_states_108, 0.0, False, False)
+    hidden_states_108 = None
+    hidden_states_109 = hidden_states_104 + dropout_19
+    hidden_states_104 = dropout_19 = None
+    hidden_states_110 = hidden_states_109.to(torch.float32)
+    pow_21 = hidden_states_110.pow(2)
+    variance_20 = pow_21.mean(-1, keepdim=True)
+    pow_21 = None
+    add_60 = variance_20 + 1e-05
+    variance_20 = None
+    rsqrt_20 = torch.rsqrt(add_60)
+    add_60 = None
+    hidden_states_111 = hidden_states_110 * rsqrt_20
+    hidden_states_110 = rsqrt_20 = None
+    to_41 = hidden_states_111.to(torch.bfloat16)
+    hidden_states_111 = None
+    hidden_states_112 = l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ * to_41
+    l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = to_41 = None
+    qkv_10 = torch._C._nn.linear(hidden_states_112, l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_, None)
+    hidden_states_112 = l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
+    query_states_20 = qkv_10[..., :3072]
+    key_states_20 = qkv_10[..., 3072:4096]
+    value_states_20 = qkv_10[..., 4096:]
+    qkv_10 = None
+    view_30 = query_states_20.view((1, 2, -1, 128))
+    query_states_20 = None
+    query_states_21 = view_30.transpose(1, 2)
+    view_30 = None
+    view_31 = key_states_20.view((1, 2, -1, 128))
+    key_states_20 = None
+    key_states_21 = view_31.transpose(1, 2)
+    view_31 = None
+    view_32 = value_states_20.view((1, 2, -1, 128))
+    value_states_20 = None
+    value_states_21 = view_32.transpose(1, 2)
+    view_32 = None
+    cos_10 = l_stack0_0_.unsqueeze(1)
+    sin_10 = l_stack0_1_.unsqueeze(1)
+    q_rot_10 = query_states_21[..., :96]
+    q_pass_10 = query_states_21[..., 96:]
+    query_states_21 = None
+    k_rot_10 = key_states_21[..., :96]
+    k_pass_10 = key_states_21[..., 96:]
+    key_states_21 = None
+    mul_92 = q_rot_10 * cos_10
+    x1_20 = q_rot_10[..., :48]
+    x2_20 = q_rot_10[..., 48:]
+    q_rot_10 = None
+    neg_20 = -x2_20
+    x2_20 = None
+    cat_40 = torch.cat((neg_20, x1_20), dim=-1)
+    neg_20 = x1_20 = None
+    mul_93 = cat_40 * sin_10
+    cat_40 = None
+    add_61 = mul_92 + mul_93
+    mul_92 = mul_93 = None
+    q_embed_10 = torch.cat([add_61, q_pass_10], dim=-1)
+    add_61 = q_pass_10 = None
+    mul_94 = k_rot_10 * cos_10
+    cos_10 = None
+    x1_21 = k_rot_10[..., :48]
+    x2_21 = k_rot_10[..., 48:]
+    k_rot_10 = None
+    neg_21 = -x2_21
+    x2_21 = None
+    cat_42 = torch.cat((neg_21, x1_21), dim=-1)
+    neg_21 = x1_21 = None
+    mul_95 = cat_42 * sin_10
+    cat_42 = sin_10 = None
+    add_62 = mul_94 + mul_95
+    mul_94 = mul_95 = None
+    k_embed_10 = torch.cat([add_62, k_pass_10], dim=-1)
+    add_62 = k_pass_10 = None
+    getitem_171 = k_embed_10[:, :, None, :, :]
+    hidden_states_113 = getitem_171.expand(1, 8, 3, 2, 128)
+    getitem_171 = None
+    key_20 = hidden_states_113.reshape(1, 24, 2, 128)
+    hidden_states_113 = None
+    getitem_172 = value_states_21[:, :, None, :, :]
+    hidden_states_114 = getitem_172.expand(1, 8, 3, 2, 128)
+    getitem_172 = None
+    value_20 = hidden_states_114.reshape(1, 24, 2, 128)
+    hidden_states_114 = None
+    attention_mask_10 = l_causal_mask_[:, :, :, :2]
+    query_10 = q_embed_10.contiguous()
+    q_embed_10 = None
+    key_21 = key_20.contiguous()
+    key_20 = None
+    value_21 = value_20.contiguous()
+    value_20 = None
+    attn_output_40 = torch._C._nn.scaled_dot_product_attention(query_10, key_21, value_21, attn_mask=attention_mask_10, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_10 = key_21 = value_21 = attention_mask_10 = None
+    transpose_43 = attn_output_40.transpose(1, 2)
+    attn_output_40 = None
+    attn_output_41 = transpose_43.contiguous()
+    transpose_43 = None
+    reshape_32 = attn_output_41.reshape(1, 2, -1)
+    attn_output_41 = None
+    attn_output_42 = reshape_32.contiguous()
+    reshape_32 = None
+    attn_output_43 = torch._C._nn.linear(attn_output_42, l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_42 = l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = None
+    dropout_20 = torch.nn.functional.dropout(attn_output_43, 0.0, False, False)
+    attn_output_43 = None
+    hidden_states_115 = hidden_states_109 + dropout_20
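+    # NOTE: the .contiguous() calls ahead of scaled_dot_product_attention materialize
+    # the transposed/expanded views so the attention kernel receives dense tensors.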
+    hidden_states_109 = dropout_20 = None
+    hidden_states_116 = hidden_states_115.to(torch.float32)
+    pow_22 = hidden_states_116.pow(2)
+    variance_21 = pow_22.mean(-1, keepdim=True)
+    pow_22 = None
+    add_64 = variance_21 + 1e-05
+    variance_21 = None
+    rsqrt_21 = torch.rsqrt(add_64)
+    add_64 = None
+    hidden_states_117 = hidden_states_116 * rsqrt_21
+    hidden_states_116 = rsqrt_21 = None
+    to_43 = hidden_states_117.to(torch.bfloat16)
+    hidden_states_117 = None
+    hidden_states_118 = l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ * to_43
+    l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = to_43 = None
+    up_states_30 = torch._C._nn.linear(hidden_states_118, l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_118 = l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_10 = up_states_30.chunk(2, dim=-1)
+    up_states_30 = None
+    gate_10 = chunk_10[0]
+    up_states_31 = chunk_10[1]
+    chunk_10 = None
+    silu_10 = torch.nn.functional.silu(gate_10, inplace=False)
+    gate_10 = None
+    up_states_32 = up_states_31 * silu_10
+    up_states_31 = silu_10 = None
+    hidden_states_119 = torch._C._nn.linear(up_states_32, l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_, None)
+    up_states_32 = l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = None
+    dropout_21 = torch.nn.functional.dropout(hidden_states_119, 0.0, False, False)
+    hidden_states_119 = None
+    hidden_states_120 = hidden_states_115 + dropout_21
+    hidden_states_115 = dropout_21 = None
+    hidden_states_121 = hidden_states_120.to(torch.float32)
+    pow_23 = hidden_states_121.pow(2)
+    variance_22 = pow_23.mean(-1, keepdim=True)
+    pow_23 = None
+    add_66 = variance_22 + 1e-05
+    variance_22 = None
+    rsqrt_22 = torch.rsqrt(add_66)
+    add_66 = None
+    hidden_states_122 = hidden_states_121 * rsqrt_22
+    hidden_states_121 = rsqrt_22 = None
+    to_45 = hidden_states_122.to(torch.bfloat16)
+    hidden_states_122 = None
+    hidden_states_123 = l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ * to_45
+    l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = to_45 = None
+    qkv_11 = torch._C._nn.linear(hidden_states_123, l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_, None)
+    hidden_states_123 = l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
+    query_states_22 = qkv_11[..., :3072]
+    key_states_22 = qkv_11[..., 3072:4096]
+    value_states_22 = qkv_11[..., 4096:]
+    qkv_11 = None
+    view_33 = query_states_22.view((1, 2, -1, 128))
+    query_states_22 = None
+    query_states_23 = view_33.transpose(1, 2)
+    view_33 = None
+    view_34 = key_states_22.view((1, 2, -1, 128))
+    key_states_22 = None
+    key_states_23 = view_34.transpose(1, 2)
+    view_34 = None
+    view_35 = value_states_22.view((1, 2, -1, 128))
+    value_states_22 = None
+    value_states_23 = view_35.transpose(1, 2)
+    view_35 = None
+    cos_11 = l_stack0_0_.unsqueeze(1)
+    sin_11 = l_stack0_1_.unsqueeze(1)
+    q_rot_11 = query_states_23[..., :96]
+    q_pass_11 = query_states_23[..., 96:]
+    query_states_23 = None
+    k_rot_11 = key_states_23[..., :96]
+    k_pass_11 = key_states_23[..., 96:]
+    key_states_23 = None
+    mul_101 = q_rot_11 * cos_11
+    x1_22 = q_rot_11[..., :48]
+    x2_22 = q_rot_11[..., 48:]
+    q_rot_11 = None
+    neg_22 = -x2_22
+    x2_22 = None
+    cat_44 = torch.cat((neg_22, x1_22), dim=-1)
+    neg_22 = x1_22 = None
+    mul_102 = cat_44 * sin_11
+    cat_44 = None
+    add_67 = mul_101 + mul_102
+    mul_101 = mul_102 = None
+    q_embed_11 = torch.cat([add_67, q_pass_11], dim=-1)
+    add_67 = q_pass_11 = None
+    mul_103 = k_rot_11 * cos_11
+    cos_11 = None
+    x1_23 = k_rot_11[..., :48]
+    x2_23 = k_rot_11[..., 48:]
+    k_rot_11 = None
+    neg_23 = -x2_23
+    x2_23 = None
+    cat_46 = torch.cat((neg_23, x1_23), dim=-1)
+    neg_23 = x1_23 = None
+    mul_104 = cat_46 * sin_11
+    cat_46 = sin_11 = None
+    add_68 = mul_103 + mul_104
+    mul_103 = mul_104 = None
+    k_embed_11 = torch.cat([add_68, k_pass_11], dim=-1)
+    add_68 = k_pass_11 = None
+    getitem_187 = k_embed_11[:, :, None, :, :]
+    hidden_states_124 = getitem_187.expand(1, 8, 3, 2, 128)
+    getitem_187 = None
+    key_22 = hidden_states_124.reshape(1, 24, 2, 128)
+    hidden_states_124 = None
+    getitem_188 = value_states_23[:, :, None, :, :]
+    hidden_states_125 = getitem_188.expand(1, 8, 3, 2, 128)
+    getitem_188 = None
+    value_22 = hidden_states_125.reshape(1, 24, 2, 128)
+    hidden_states_125 = None
+    attention_mask_11 = l_causal_mask_[:, :, :, :2]
+    query_11 = q_embed_11.contiguous()
+    q_embed_11 = None
+    key_23 = key_22.contiguous()
+    key_22 = None
+    value_23 = value_22.contiguous()
+    value_22 = None
+    attn_output_44 = torch._C._nn.scaled_dot_product_attention(query_11, key_23, value_23, attn_mask=attention_mask_11, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_11 = key_23 = value_23 = attention_mask_11 = None
+    transpose_47 = attn_output_44.transpose(1, 2)
+    attn_output_44 = None
+    attn_output_45 = transpose_47.contiguous()
+    transpose_47 = None
+    reshape_35 = attn_output_45.reshape(1, 2, -1)
+    attn_output_45 = None
+    attn_output_46 = reshape_35.contiguous()
+    reshape_35 = None
+    attn_output_47 = torch._C._nn.linear(attn_output_46, l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_46 = l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = None
+    dropout_22 = torch.nn.functional.dropout(attn_output_47, 0.0, False, False)
+    attn_output_47 = None
+    hidden_states_126 = hidden_states_120 + dropout_22
+    hidden_states_120 = dropout_22 = None
+    hidden_states_127 = hidden_states_126.to(torch.float32)
+    pow_24 = hidden_states_127.pow(2)
+    variance_23 = pow_24.mean(-1, keepdim=True)
+    pow_24 = None
+    add_70 = variance_23 + 1e-05
+    variance_23 = None
+    rsqrt_23 = torch.rsqrt(add_70)
+    add_70 = None
+    hidden_states_128 = hidden_states_127 * rsqrt_23
+    hidden_states_127 = rsqrt_23 = None
+    to_47 = hidden_states_128.to(torch.bfloat16)
+    hidden_states_128 = None
+    hidden_states_129 = l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ * to_47
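+    # NOTE: all shapes in this graph are specialized to the captured example
+    # (batch=1, seq_len=2, 24 query heads, 8 KV heads, head_dim=128, hidden=3072);
+    # other input shapes would require re-capture unless the relevant dimensions are
+    # marked dynamic.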
+    l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = to_47 = None
+    up_states_33 = torch._C._nn.linear(hidden_states_129, l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_129 = l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_11 = up_states_33.chunk(2, dim=-1)
+    up_states_33 = None
+    gate_11 = chunk_11[0]
+    up_states_34 = chunk_11[1]
+    chunk_11 = None
+    silu_11 = torch.nn.functional.silu(gate_11, inplace=False)
+    gate_11 = None
+    up_states_35 = up_states_34 * silu_11
+    up_states_34 = silu_11 = None
+    hidden_states_130 = torch._C._nn.linear(up_states_35, l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_, None)
+    up_states_35 = l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = None
+    dropout_23 = torch.nn.functional.dropout(hidden_states_130, 0.0, False, False)
+    hidden_states_130 = None
+    hidden_states_131 = hidden_states_126 + dropout_23
+    hidden_states_126 = dropout_23 = None
+    hidden_states_132 = hidden_states_131.to(torch.float32)
+    pow_25 = hidden_states_132.pow(2)
+    variance_24 = pow_25.mean(-1, keepdim=True)
+    pow_25 = None
+    add_72 = variance_24 + 1e-05
+    variance_24 = None
+    rsqrt_24 = torch.rsqrt(add_72)
+    add_72 = None
+    hidden_states_133 = hidden_states_132 * rsqrt_24
+    hidden_states_132 = rsqrt_24 = None
+    to_49 = hidden_states_133.to(torch.bfloat16)
+    hidden_states_133 = None
+    hidden_states_134 = l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ * to_49
+    l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = to_49 = None
+    qkv_12 = torch._C._nn.linear(hidden_states_134, l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_, None)
+    hidden_states_134 = l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = None
+    query_states_24 = qkv_12[..., :3072]
+    key_states_24 = qkv_12[..., 3072:4096]
+    value_states_24 = qkv_12[..., 4096:]
+    qkv_12 = None
+    view_36 = query_states_24.view((1, 2, -1, 128))
+    query_states_24 = None
+    query_states_25 = view_36.transpose(1, 2)
+    view_36 = None
+    view_37 = key_states_24.view((1, 2, -1, 128))
+    key_states_24 = None
+    key_states_25 = view_37.transpose(1, 2)
+    view_37 = None
+    view_38 = value_states_24.view((1, 2, -1, 128))
+    value_states_24 = None
+    value_states_25 = view_38.transpose(1, 2)
+    view_38 = None
+    cos_12 = l_stack0_0_.unsqueeze(1)
+    sin_12 = l_stack0_1_.unsqueeze(1)
+    q_rot_12 = query_states_25[..., :96]
+    q_pass_12 = query_states_25[..., 96:]
+    query_states_25 = None
+    k_rot_12 = key_states_25[..., :96]
+    k_pass_12 = key_states_25[..., 96:]
+    key_states_25 = None
+    mul_110 = q_rot_12 * cos_12
+    x1_24 = q_rot_12[..., :48]
+    x2_24 = q_rot_12[..., 48:]
+    q_rot_12 = None
+    neg_24 = -x2_24
+    x2_24 = None
+    cat_48 = torch.cat((neg_24, x1_24), dim=-1)
+    neg_24 = x1_24 = None
+    mul_111 = cat_48 * sin_12
+    cat_48 = None
+    add_73 = mul_110 + mul_111
+    mul_110 = mul_111 = None
+    q_embed_12 = torch.cat([add_73, q_pass_12], dim=-1)
+    add_73 = q_pass_12 = None
+    mul_112 = k_rot_12 * cos_12
+    cos_12 = None
+    x1_25 = k_rot_12[..., :48]
+    x2_25 = k_rot_12[..., 48:]
+    k_rot_12 = None
+    neg_25 = -x2_25
+    x2_25 = None
+    cat_50 = torch.cat((neg_25, x1_25), dim=-1)
+    neg_25 = x1_25 = None
+    mul_113 = cat_50 * sin_12
+    cat_50 = sin_12 = None
+    add_74 = mul_112 + mul_113
+    mul_112 = mul_113 = None
+    k_embed_12 = torch.cat([add_74, k_pass_12], dim=-1)
+    add_74 = k_pass_12 = None
+    getitem_203 = k_embed_12[:, :, None, :, :]
+    hidden_states_135 = getitem_203.expand(1, 8, 3, 2, 128)
+    getitem_203 = None
+    key_24 = hidden_states_135.reshape(1, 24, 2, 128)
+    hidden_states_135 = None
+    getitem_204 = value_states_25[:, :, None, :, :]
+    hidden_states_136 = getitem_204.expand(1, 8, 3, 2, 128)
+    getitem_204 = None
+    value_24 = hidden_states_136.reshape(1, 24, 2, 128)
+    hidden_states_136 = None
+    attention_mask_12 = l_causal_mask_[:, :, :, :2]
+    query_12 = q_embed_12.contiguous()
+    q_embed_12 = None
+    key_25 = key_24.contiguous()
+    key_24 = None
+    value_25 = value_24.contiguous()
+    value_24 = None
+    attn_output_48 = torch._C._nn.scaled_dot_product_attention(query_12, key_25, value_25, attn_mask=attention_mask_12, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+    query_12 = key_25 = value_25 = attention_mask_12 = None
+    transpose_51 = attn_output_48.transpose(1, 2)
+    attn_output_48 = None
+    attn_output_49 = transpose_51.contiguous()
+    transpose_51 = None
+    reshape_38 = attn_output_49.reshape(1, 2, -1)
+    attn_output_49 = None
+    attn_output_50 = reshape_38.contiguous()
+    reshape_38 = None
+    attn_output_51 = torch._C._nn.linear(attn_output_50, l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_, None)
+    attn_output_50 = l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = None
+    dropout_24 = torch.nn.functional.dropout(attn_output_51, 0.0, False, False)
+    attn_output_51 = None
+    hidden_states_137 = hidden_states_131 + dropout_24
+    hidden_states_131 = dropout_24 = None
+    hidden_states_138 = hidden_states_137.to(torch.float32)
+    pow_26 = hidden_states_138.pow(2)
+    variance_25 = pow_26.mean(-1, keepdim=True)
+    pow_26 = None
+    add_76 = variance_25 + 1e-05
+    variance_25 = None
+    rsqrt_25 = torch.rsqrt(add_76)
+    add_76 = None
+    hidden_states_139 = hidden_states_138 * rsqrt_25
+    hidden_states_138 = rsqrt_25 = None
+    to_51 = hidden_states_139.to(torch.bfloat16)
+    hidden_states_139 = None
+    hidden_states_140 = l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ * to_51
+    l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = to_51 = None
+    up_states_36 = torch._C._nn.linear(hidden_states_140, l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_, None)
+    hidden_states_140 = l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = None
+    chunk_12 = up_states_36.chunk(2, dim=-1)
+    up_states_36 = None
+    gate_12 = chunk_12[0]
+    up_states_37 = chunk_12[1]
+    chunk_12 = None
+    silu_12 = torch.nn.functional.silu(gate_12, inplace=False)
+    gate_12 = None
+    up_states_38 = up_states_37 * silu_12
+    up_states_37 = silu_12 = None
+    hidden_states_141 = torch._C._nn.linear(up_states_38, l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, None)
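+    # NOTE: torch.nn.functional.dropout(x, 0.0, False, False) is the positional form
+    # of dropout(x, p=0.0, training=False, inplace=False), i.e. an identity op
+    # preserved verbatim by the trace.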
= torch._C._nn.linear( + up_states_38, + l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_38 = l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_25 = torch.nn.functional.dropout(hidden_states_141, 0.0, False, False) + hidden_states_141 = None + hidden_states_142 = hidden_states_137 + dropout_25 + hidden_states_137 = dropout_25 = None + hidden_states_143 = hidden_states_142.to(torch.float32) + pow_27 = hidden_states_143.pow(2) + variance_26 = pow_27.mean(-1, keepdim=True) + pow_27 = None + add_78 = variance_26 + 1e-05 + variance_26 = None + rsqrt_26 = torch.rsqrt(add_78) + add_78 = None + hidden_states_144 = hidden_states_143 * rsqrt_26 + hidden_states_143 = rsqrt_26 = None + to_53 = hidden_states_144.to(torch.bfloat16) + hidden_states_144 = None + hidden_states_145 = ( + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + * to_53 + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + to_53 + ) = None + qkv_13 = torch._C._nn.linear( + hidden_states_145, + l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_145 = l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_26 = qkv_13[(Ellipsis, slice(None, 3072, None))] + key_states_26 = qkv_13[(Ellipsis, slice(3072, 4096, None))] + value_states_26 = qkv_13[(Ellipsis, slice(4096, None, None))] + qkv_13 = None + view_39 = query_states_26.view((1, 2, -1, 128)) + query_states_26 = None + query_states_27 = view_39.transpose(1, 2) + view_39 = None + view_40 = key_states_26.view((1, 2, -1, 128)) + key_states_26 = None + key_states_27 = view_40.transpose(1, 2) + view_40 = None + view_41 = value_states_26.view((1, 2, -1, 128)) + value_states_26 = None + value_states_27 = view_41.transpose(1, 2) + view_41 = None + cos_13 = l_stack0_0_.unsqueeze(1) + sin_13 = l_stack0_1_.unsqueeze(1) + q_rot_13 = query_states_27[(Ellipsis, slice(None, 96, None))] + q_pass_13 = query_states_27[(Ellipsis, slice(96, None, None))] + query_states_27 = None + k_rot_13 = key_states_27[(Ellipsis, slice(None, 96, None))] + k_pass_13 = key_states_27[(Ellipsis, slice(96, None, None))] + key_states_27 = None + mul_119 = q_rot_13 * cos_13 + x1_26 = q_rot_13[(Ellipsis, slice(None, 48, None))] + x2_26 = q_rot_13[(Ellipsis, slice(48, None, None))] + q_rot_13 = None + neg_26 = -x2_26 + x2_26 = None + cat_52 = torch.cat((neg_26, x1_26), dim=-1) + neg_26 = x1_26 = None + mul_120 = cat_52 * sin_13 + cat_52 = None + add_79 = mul_119 + mul_120 + mul_119 = mul_120 = None + q_embed_13 = torch.cat([add_79, q_pass_13], dim=-1) + add_79 = q_pass_13 = None + mul_121 = k_rot_13 * cos_13 + cos_13 = None + x1_27 = k_rot_13[(Ellipsis, slice(None, 48, None))] + x2_27 = k_rot_13[(Ellipsis, slice(48, None, None))] + k_rot_13 = None + neg_27 = -x2_27 + x2_27 = None + cat_54 = torch.cat((neg_27, x1_27), dim=-1) + neg_27 = x1_27 = None + mul_122 = cat_54 * sin_13 + cat_54 = sin_13 = None + add_80 = mul_121 + mul_122 + mul_121 = mul_122 = None + k_embed_13 = torch.cat([add_80, k_pass_13], dim=-1) + add_80 = k_pass_13 = None + getitem_219 = k_embed_13[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_146 = getitem_219.expand(1, 8, 3, 2, 128) + getitem_219 = None + key_26 = hidden_states_146.reshape(1, 24, 2, 128) + hidden_states_146 = 
None + getitem_220 = value_states_27[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_147 = getitem_220.expand(1, 8, 3, 2, 128) + getitem_220 = None + value_26 = hidden_states_147.reshape(1, 24, 2, 128) + hidden_states_147 = None + attention_mask_13 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_13 = q_embed_13.contiguous() + q_embed_13 = None + key_27 = key_26.contiguous() + key_26 = None + value_27 = value_26.contiguous() + value_26 = None + attn_output_52 = torch._C._nn.scaled_dot_product_attention( + query_13, + key_27, + value_27, + attn_mask=attention_mask_13, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_13 = key_27 = value_27 = attention_mask_13 = None + transpose_55 = attn_output_52.transpose(1, 2) + attn_output_52 = None + attn_output_53 = transpose_55.contiguous() + transpose_55 = None + reshape_41 = attn_output_53.reshape(1, 2, -1) + attn_output_53 = None + attn_output_54 = reshape_41.contiguous() + reshape_41 = None + attn_output_55 = torch._C._nn.linear( + attn_output_54, + l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_54 = l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_26 = torch.nn.functional.dropout(attn_output_55, 0.0, False, False) + attn_output_55 = None + hidden_states_148 = hidden_states_142 + dropout_26 + hidden_states_142 = dropout_26 = None + hidden_states_149 = hidden_states_148.to(torch.float32) + pow_28 = hidden_states_149.pow(2) + variance_27 = pow_28.mean(-1, keepdim=True) + pow_28 = None + add_82 = variance_27 + 1e-05 + variance_27 = None + rsqrt_27 = torch.rsqrt(add_82) + add_82 = None + hidden_states_150 = hidden_states_149 * rsqrt_27 + hidden_states_149 = rsqrt_27 = None + to_55 = hidden_states_150.to(torch.bfloat16) + hidden_states_150 = None + hidden_states_151 = ( + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + * to_55 + ) + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = ( + to_55 + ) = None + up_states_39 = torch._C._nn.linear( + hidden_states_151, + l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_151 = l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_13 = up_states_39.chunk(2, dim=-1) + up_states_39 = None + gate_13 = chunk_13[0] + up_states_40 = chunk_13[1] + chunk_13 = None + silu_13 = torch.nn.functional.silu(gate_13, inplace=False) + gate_13 = None + up_states_41 = up_states_40 * silu_13 + up_states_40 = silu_13 = None + hidden_states_152 = torch._C._nn.linear( + up_states_41, + l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_41 = l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_27 = torch.nn.functional.dropout(hidden_states_152, 0.0, False, False) + hidden_states_152 = None + hidden_states_153 = hidden_states_148 + dropout_27 + hidden_states_148 = dropout_27 = None + hidden_states_154 = hidden_states_153.to(torch.float32) + pow_29 = hidden_states_154.pow(2) + variance_28 = pow_29.mean(-1, keepdim=True) + pow_29 = None + add_84 = variance_28 + 1e-05 + variance_28 = None + rsqrt_28 = torch.rsqrt(add_84) 
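# [editor's note] The float32 upcast / pow(2).mean / rsqrt / bfloat16 downcast
# sequence that recurs before every attention and MLP block above is a
# Dynamo-captured RMSNorm. A minimal hand-written sketch of the same math
# (the `weight` tensor here is an illustrative stand-in for the per-layer
# layernorm parameter; eps=1e-05 matches the traced constant):
import torch

def rms_norm(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float = 1e-05) -> torch.Tensor:
    x = hidden_states.to(torch.float32)
    variance = x.pow(2).mean(-1, keepdim=True)   # mean of squares over the hidden dim
    x = x * torch.rsqrt(variance + eps)          # normalize without mean-centering
    return weight * x.to(torch.bfloat16)         # rescale in the model's compute dtype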
+ add_84 = None + hidden_states_155 = hidden_states_154 * rsqrt_28 + hidden_states_154 = rsqrt_28 = None + to_57 = hidden_states_155.to(torch.bfloat16) + hidden_states_155 = None + hidden_states_156 = ( + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + * to_57 + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + to_57 + ) = None + qkv_14 = torch._C._nn.linear( + hidden_states_156, + l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_156 = l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_28 = qkv_14[(Ellipsis, slice(None, 3072, None))] + key_states_28 = qkv_14[(Ellipsis, slice(3072, 4096, None))] + value_states_28 = qkv_14[(Ellipsis, slice(4096, None, None))] + qkv_14 = None + view_42 = query_states_28.view((1, 2, -1, 128)) + query_states_28 = None + query_states_29 = view_42.transpose(1, 2) + view_42 = None + view_43 = key_states_28.view((1, 2, -1, 128)) + key_states_28 = None + key_states_29 = view_43.transpose(1, 2) + view_43 = None + view_44 = value_states_28.view((1, 2, -1, 128)) + value_states_28 = None + value_states_29 = view_44.transpose(1, 2) + view_44 = None + cos_14 = l_stack0_0_.unsqueeze(1) + sin_14 = l_stack0_1_.unsqueeze(1) + q_rot_14 = query_states_29[(Ellipsis, slice(None, 96, None))] + q_pass_14 = query_states_29[(Ellipsis, slice(96, None, None))] + query_states_29 = None + k_rot_14 = key_states_29[(Ellipsis, slice(None, 96, None))] + k_pass_14 = key_states_29[(Ellipsis, slice(96, None, None))] + key_states_29 = None + mul_128 = q_rot_14 * cos_14 + x1_28 = q_rot_14[(Ellipsis, slice(None, 48, None))] + x2_28 = q_rot_14[(Ellipsis, slice(48, None, None))] + q_rot_14 = None + neg_28 = -x2_28 + x2_28 = None + cat_56 = torch.cat((neg_28, x1_28), dim=-1) + neg_28 = x1_28 = None + mul_129 = cat_56 * sin_14 + cat_56 = None + add_85 = mul_128 + mul_129 + mul_128 = mul_129 = None + q_embed_14 = torch.cat([add_85, q_pass_14], dim=-1) + add_85 = q_pass_14 = None + mul_130 = k_rot_14 * cos_14 + cos_14 = None + x1_29 = k_rot_14[(Ellipsis, slice(None, 48, None))] + x2_29 = k_rot_14[(Ellipsis, slice(48, None, None))] + k_rot_14 = None + neg_29 = -x2_29 + x2_29 = None + cat_58 = torch.cat((neg_29, x1_29), dim=-1) + neg_29 = x1_29 = None + mul_131 = cat_58 * sin_14 + cat_58 = sin_14 = None + add_86 = mul_130 + mul_131 + mul_130 = mul_131 = None + k_embed_14 = torch.cat([add_86, k_pass_14], dim=-1) + add_86 = k_pass_14 = None + getitem_235 = k_embed_14[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_157 = getitem_235.expand(1, 8, 3, 2, 128) + getitem_235 = None + key_28 = hidden_states_157.reshape(1, 24, 2, 128) + hidden_states_157 = None + getitem_236 = value_states_29[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_158 = getitem_236.expand(1, 8, 3, 2, 128) + getitem_236 = None + value_28 = hidden_states_158.reshape(1, 24, 2, 128) + hidden_states_158 = None + attention_mask_14 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_14 = q_embed_14.contiguous() + q_embed_14 = None + key_29 = key_28.contiguous() + key_28 = None + value_29 = value_28.contiguous() + value_28 = None + attn_output_56 = 
torch._C._nn.scaled_dot_product_attention( + query_14, + key_29, + value_29, + attn_mask=attention_mask_14, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_14 = key_29 = value_29 = attention_mask_14 = None + transpose_59 = attn_output_56.transpose(1, 2) + attn_output_56 = None + attn_output_57 = transpose_59.contiguous() + transpose_59 = None + reshape_44 = attn_output_57.reshape(1, 2, -1) + attn_output_57 = None + attn_output_58 = reshape_44.contiguous() + reshape_44 = None + attn_output_59 = torch._C._nn.linear( + attn_output_58, + l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_58 = l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_28 = torch.nn.functional.dropout(attn_output_59, 0.0, False, False) + attn_output_59 = None + hidden_states_159 = hidden_states_153 + dropout_28 + hidden_states_153 = dropout_28 = None + hidden_states_160 = hidden_states_159.to(torch.float32) + pow_30 = hidden_states_160.pow(2) + variance_29 = pow_30.mean(-1, keepdim=True) + pow_30 = None + add_88 = variance_29 + 1e-05 + variance_29 = None + rsqrt_29 = torch.rsqrt(add_88) + add_88 = None + hidden_states_161 = hidden_states_160 * rsqrt_29 + hidden_states_160 = rsqrt_29 = None + to_59 = hidden_states_161.to(torch.bfloat16) + hidden_states_161 = None + hidden_states_162 = ( + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + * to_59 + ) + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = ( + to_59 + ) = None + up_states_42 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_162 = l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_14 = up_states_42.chunk(2, dim=-1) + up_states_42 = None + gate_14 = chunk_14[0] + up_states_43 = chunk_14[1] + chunk_14 = None + silu_14 = torch.nn.functional.silu(gate_14, inplace=False) + gate_14 = None + up_states_44 = up_states_43 * silu_14 + up_states_43 = silu_14 = None + hidden_states_163 = torch._C._nn.linear( + up_states_44, + l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_44 = l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_29 = torch.nn.functional.dropout(hidden_states_163, 0.0, False, False) + hidden_states_163 = None + hidden_states_164 = hidden_states_159 + dropout_29 + hidden_states_159 = dropout_29 = None + hidden_states_165 = hidden_states_164.to(torch.float32) + pow_31 = hidden_states_165.pow(2) + variance_30 = pow_31.mean(-1, keepdim=True) + pow_31 = None + add_90 = variance_30 + 1e-05 + variance_30 = None + rsqrt_30 = torch.rsqrt(add_90) + add_90 = None + hidden_states_166 = hidden_states_165 * rsqrt_30 + hidden_states_165 = rsqrt_30 = None + to_61 = hidden_states_166.to(torch.bfloat16) + hidden_states_166 = None + hidden_states_167 = ( + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + * to_61 + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + to_61 + ) = None + qkv_15 = torch._C._nn.linear( + hidden_states_167, + l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_167 = 
l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_30 = qkv_15[(Ellipsis, slice(None, 3072, None))] + key_states_30 = qkv_15[(Ellipsis, slice(3072, 4096, None))] + value_states_30 = qkv_15[(Ellipsis, slice(4096, None, None))] + qkv_15 = None + view_45 = query_states_30.view((1, 2, -1, 128)) + query_states_30 = None + query_states_31 = view_45.transpose(1, 2) + view_45 = None + view_46 = key_states_30.view((1, 2, -1, 128)) + key_states_30 = None + key_states_31 = view_46.transpose(1, 2) + view_46 = None + view_47 = value_states_30.view((1, 2, -1, 128)) + value_states_30 = None + value_states_31 = view_47.transpose(1, 2) + view_47 = None + cos_15 = l_stack0_0_.unsqueeze(1) + sin_15 = l_stack0_1_.unsqueeze(1) + q_rot_15 = query_states_31[(Ellipsis, slice(None, 96, None))] + q_pass_15 = query_states_31[(Ellipsis, slice(96, None, None))] + query_states_31 = None + k_rot_15 = key_states_31[(Ellipsis, slice(None, 96, None))] + k_pass_15 = key_states_31[(Ellipsis, slice(96, None, None))] + key_states_31 = None + mul_137 = q_rot_15 * cos_15 + x1_30 = q_rot_15[(Ellipsis, slice(None, 48, None))] + x2_30 = q_rot_15[(Ellipsis, slice(48, None, None))] + q_rot_15 = None + neg_30 = -x2_30 + x2_30 = None + cat_60 = torch.cat((neg_30, x1_30), dim=-1) + neg_30 = x1_30 = None + mul_138 = cat_60 * sin_15 + cat_60 = None + add_91 = mul_137 + mul_138 + mul_137 = mul_138 = None + q_embed_15 = torch.cat([add_91, q_pass_15], dim=-1) + add_91 = q_pass_15 = None + mul_139 = k_rot_15 * cos_15 + cos_15 = None + x1_31 = k_rot_15[(Ellipsis, slice(None, 48, None))] + x2_31 = k_rot_15[(Ellipsis, slice(48, None, None))] + k_rot_15 = None + neg_31 = -x2_31 + x2_31 = None + cat_62 = torch.cat((neg_31, x1_31), dim=-1) + neg_31 = x1_31 = None + mul_140 = cat_62 * sin_15 + cat_62 = sin_15 = None + add_92 = mul_139 + mul_140 + mul_139 = mul_140 = None + k_embed_15 = torch.cat([add_92, k_pass_15], dim=-1) + add_92 = k_pass_15 = None + getitem_251 = k_embed_15[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_168 = getitem_251.expand(1, 8, 3, 2, 128) + getitem_251 = None + key_30 = hidden_states_168.reshape(1, 24, 2, 128) + hidden_states_168 = None + getitem_252 = value_states_31[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_169 = getitem_252.expand(1, 8, 3, 2, 128) + getitem_252 = None + value_30 = hidden_states_169.reshape(1, 24, 2, 128) + hidden_states_169 = None + attention_mask_15 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_15 = q_embed_15.contiguous() + q_embed_15 = None + key_31 = key_30.contiguous() + key_30 = None + value_31 = value_30.contiguous() + value_30 = None + attn_output_60 = torch._C._nn.scaled_dot_product_attention( + query_15, + key_31, + value_31, + attn_mask=attention_mask_15, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_15 = key_31 = value_31 = attention_mask_15 = None + transpose_63 = attn_output_60.transpose(1, 2) + attn_output_60 = None + attn_output_61 = transpose_63.contiguous() + transpose_63 = None + reshape_47 = attn_output_61.reshape(1, 2, -1) + attn_output_61 = None + attn_output_62 = reshape_47.contiguous() + reshape_47 = None + attn_output_63 = torch._C._nn.linear( + attn_output_62, + 
l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_62 = l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_30 = torch.nn.functional.dropout(attn_output_63, 0.0, False, False) + attn_output_63 = None + hidden_states_170 = hidden_states_164 + dropout_30 + hidden_states_164 = dropout_30 = None + hidden_states_171 = hidden_states_170.to(torch.float32) + pow_32 = hidden_states_171.pow(2) + variance_31 = pow_32.mean(-1, keepdim=True) + pow_32 = None + add_94 = variance_31 + 1e-05 + variance_31 = None + rsqrt_31 = torch.rsqrt(add_94) + add_94 = None + hidden_states_172 = hidden_states_171 * rsqrt_31 + hidden_states_171 = rsqrt_31 = None + to_63 = hidden_states_172.to(torch.bfloat16) + hidden_states_172 = None + hidden_states_173 = ( + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + * to_63 + ) + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = ( + to_63 + ) = None + up_states_45 = torch._C._nn.linear( + hidden_states_173, + l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_173 = l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_15 = up_states_45.chunk(2, dim=-1) + up_states_45 = None + gate_15 = chunk_15[0] + up_states_46 = chunk_15[1] + chunk_15 = None + silu_15 = torch.nn.functional.silu(gate_15, inplace=False) + gate_15 = None + up_states_47 = up_states_46 * silu_15 + up_states_46 = silu_15 = None + hidden_states_174 = torch._C._nn.linear( + up_states_47, + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_47 = l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_31 = torch.nn.functional.dropout(hidden_states_174, 0.0, False, False) + hidden_states_174 = None + hidden_states_175 = hidden_states_170 + dropout_31 + hidden_states_170 = dropout_31 = None + hidden_states_176 = hidden_states_175.to(torch.float32) + pow_33 = hidden_states_176.pow(2) + variance_32 = pow_33.mean(-1, keepdim=True) + pow_33 = None + add_96 = variance_32 + 1e-05 + variance_32 = None + rsqrt_32 = torch.rsqrt(add_96) + add_96 = None + hidden_states_177 = hidden_states_176 * rsqrt_32 + hidden_states_176 = rsqrt_32 = None + to_65 = hidden_states_177.to(torch.bfloat16) + hidden_states_177 = None + hidden_states_178 = ( + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + * to_65 + ) + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + to_65 + ) = None + qkv_16 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_178 = l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_32 = qkv_16[(Ellipsis, slice(None, 3072, None))] + key_states_32 = qkv_16[(Ellipsis, slice(3072, 4096, None))] + value_states_32 = qkv_16[(Ellipsis, slice(4096, None, None))] + qkv_16 = None + view_48 = query_states_32.view((1, 2, -1, 128)) + query_states_32 = None + query_states_33 = view_48.transpose(1, 2) + view_48 = None + view_49 = key_states_32.view((1, 2, -1, 128)) + key_states_32 = None + key_states_33 = view_49.transpose(1, 2) + view_49 = None + view_50 = value_states_32.view((1, 2, -1, 128)) + value_states_32 = None + 
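# [editor's note] Each layer starts from one fused QKV projection that is then
# sliced: columns [:3072] hold the 24 query heads, [3072:4096] the 8 key heads,
# and [4096:] the 8 value heads, all with head_dim 128. A hedged sketch of that
# split (shapes inferred from the slice points; the weights are random
# placeholders, not the patch's parameters):
import torch

num_q_heads, num_kv_heads, head_dim, hidden_size = 24, 8, 128, 3072
qkv_weight = torch.randn((num_q_heads + 2 * num_kv_heads) * head_dim, hidden_size)
hidden = torch.randn(1, 2, hidden_size)  # (batch, seq_len, hidden_size)

qkv = torch.nn.functional.linear(hidden, qkv_weight)
query = qkv[..., : num_q_heads * head_dim]                                         # [..., :3072]
key = qkv[..., num_q_heads * head_dim : (num_q_heads + num_kv_heads) * head_dim]   # [..., 3072:4096]
value = qkv[..., (num_q_heads + num_kv_heads) * head_dim :]                        # [..., 4096:]
# as in the captured graph: (batch, seq, heads*dim) -> (batch, heads, seq, dim)
query = query.view(1, 2, -1, head_dim).transpose(1, 2)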
value_states_33 = view_50.transpose(1, 2) + view_50 = None + cos_16 = l_stack0_0_.unsqueeze(1) + sin_16 = l_stack0_1_.unsqueeze(1) + q_rot_16 = query_states_33[(Ellipsis, slice(None, 96, None))] + q_pass_16 = query_states_33[(Ellipsis, slice(96, None, None))] + query_states_33 = None + k_rot_16 = key_states_33[(Ellipsis, slice(None, 96, None))] + k_pass_16 = key_states_33[(Ellipsis, slice(96, None, None))] + key_states_33 = None + mul_146 = q_rot_16 * cos_16 + x1_32 = q_rot_16[(Ellipsis, slice(None, 48, None))] + x2_32 = q_rot_16[(Ellipsis, slice(48, None, None))] + q_rot_16 = None + neg_32 = -x2_32 + x2_32 = None + cat_64 = torch.cat((neg_32, x1_32), dim=-1) + neg_32 = x1_32 = None + mul_147 = cat_64 * sin_16 + cat_64 = None + add_97 = mul_146 + mul_147 + mul_146 = mul_147 = None + q_embed_16 = torch.cat([add_97, q_pass_16], dim=-1) + add_97 = q_pass_16 = None + mul_148 = k_rot_16 * cos_16 + cos_16 = None + x1_33 = k_rot_16[(Ellipsis, slice(None, 48, None))] + x2_33 = k_rot_16[(Ellipsis, slice(48, None, None))] + k_rot_16 = None + neg_33 = -x2_33 + x2_33 = None + cat_66 = torch.cat((neg_33, x1_33), dim=-1) + neg_33 = x1_33 = None + mul_149 = cat_66 * sin_16 + cat_66 = sin_16 = None + add_98 = mul_148 + mul_149 + mul_148 = mul_149 = None + k_embed_16 = torch.cat([add_98, k_pass_16], dim=-1) + add_98 = k_pass_16 = None + getitem_267 = k_embed_16[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_179 = getitem_267.expand(1, 8, 3, 2, 128) + getitem_267 = None + key_32 = hidden_states_179.reshape(1, 24, 2, 128) + hidden_states_179 = None + getitem_268 = value_states_33[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_180 = getitem_268.expand(1, 8, 3, 2, 128) + getitem_268 = None + value_32 = hidden_states_180.reshape(1, 24, 2, 128) + hidden_states_180 = None + attention_mask_16 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_16 = q_embed_16.contiguous() + q_embed_16 = None + key_33 = key_32.contiguous() + key_32 = None + value_33 = value_32.contiguous() + value_32 = None + attn_output_64 = torch._C._nn.scaled_dot_product_attention( + query_16, + key_33, + value_33, + attn_mask=attention_mask_16, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_16 = key_33 = value_33 = attention_mask_16 = None + transpose_67 = attn_output_64.transpose(1, 2) + attn_output_64 = None + attn_output_65 = transpose_67.contiguous() + transpose_67 = None + reshape_50 = attn_output_65.reshape(1, 2, -1) + attn_output_65 = None + attn_output_66 = reshape_50.contiguous() + reshape_50 = None + attn_output_67 = torch._C._nn.linear( + attn_output_66, + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_66 = l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_32 = torch.nn.functional.dropout(attn_output_67, 0.0, False, False) + attn_output_67 = None + hidden_states_181 = hidden_states_175 + dropout_32 + hidden_states_175 = dropout_32 = None + hidden_states_182 = hidden_states_181.to(torch.float32) + pow_34 = hidden_states_182.pow(2) + variance_33 = pow_34.mean(-1, keepdim=True) + pow_34 = None + add_100 = variance_33 + 1e-05 + variance_33 = None + rsqrt_33 = torch.rsqrt(add_100) + add_100 = None + 
hidden_states_183 = hidden_states_182 * rsqrt_33 + hidden_states_182 = rsqrt_33 = None + to_67 = hidden_states_183.to(torch.bfloat16) + hidden_states_183 = None + hidden_states_184 = ( + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + * to_67 + ) + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = ( + to_67 + ) = None + up_states_48 = torch._C._nn.linear( + hidden_states_184, + l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_184 = l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_16 = up_states_48.chunk(2, dim=-1) + up_states_48 = None + gate_16 = chunk_16[0] + up_states_49 = chunk_16[1] + chunk_16 = None + silu_16 = torch.nn.functional.silu(gate_16, inplace=False) + gate_16 = None + up_states_50 = up_states_49 * silu_16 + up_states_49 = silu_16 = None + hidden_states_185 = torch._C._nn.linear( + up_states_50, + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_50 = l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_33 = torch.nn.functional.dropout(hidden_states_185, 0.0, False, False) + hidden_states_185 = None + hidden_states_186 = hidden_states_181 + dropout_33 + hidden_states_181 = dropout_33 = None + hidden_states_187 = hidden_states_186.to(torch.float32) + pow_35 = hidden_states_187.pow(2) + variance_34 = pow_35.mean(-1, keepdim=True) + pow_35 = None + add_102 = variance_34 + 1e-05 + variance_34 = None + rsqrt_34 = torch.rsqrt(add_102) + add_102 = None + hidden_states_188 = hidden_states_187 * rsqrt_34 + hidden_states_187 = rsqrt_34 = None + to_69 = hidden_states_188.to(torch.bfloat16) + hidden_states_188 = None + hidden_states_189 = ( + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + * to_69 + ) + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + to_69 + ) = None + qkv_17 = torch._C._nn.linear( + hidden_states_189, + l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_189 = l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_34 = qkv_17[(Ellipsis, slice(None, 3072, None))] + key_states_34 = qkv_17[(Ellipsis, slice(3072, 4096, None))] + value_states_34 = qkv_17[(Ellipsis, slice(4096, None, None))] + qkv_17 = None + view_51 = query_states_34.view((1, 2, -1, 128)) + query_states_34 = None + query_states_35 = view_51.transpose(1, 2) + view_51 = None + view_52 = key_states_34.view((1, 2, -1, 128)) + key_states_34 = None + key_states_35 = view_52.transpose(1, 2) + view_52 = None + view_53 = value_states_34.view((1, 2, -1, 128)) + value_states_34 = None + value_states_35 = view_53.transpose(1, 2) + view_53 = None + cos_17 = l_stack0_0_.unsqueeze(1) + sin_17 = l_stack0_1_.unsqueeze(1) + q_rot_17 = query_states_35[(Ellipsis, slice(None, 96, None))] + q_pass_17 = query_states_35[(Ellipsis, slice(96, None, None))] + query_states_35 = None + k_rot_17 = key_states_35[(Ellipsis, slice(None, 96, None))] + k_pass_17 = key_states_35[(Ellipsis, slice(96, None, None))] + key_states_35 = None + mul_155 = q_rot_17 * cos_17 + x1_34 = q_rot_17[(Ellipsis, slice(None, 48, None))] + x2_34 = q_rot_17[(Ellipsis, slice(48, None, None))] + q_rot_17 = None + neg_34 = -x2_34 + x2_34 = None + cat_68 = torch.cat((neg_34, x1_34), dim=-1) + 
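# [editor's note] The slice / negate / concatenate pattern above
# (x1 = x[..., :48]; x2 = x[..., 48:]; cat((-x2, x1))) is the standard
# rotate_half helper of rotary position embeddings, applied as
# q_embed = q_rot * cos + rotate_half(q_rot) * sin in the multiplies around it.
# A sketch (48 is half of the 96 rotated dims; see the note on the partial
# rotary split further down):
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    half = x.shape[-1] // 2
    x1, x2 = x[..., :half], x[..., half:]
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(x_rot: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    return x_rot * cos + rotate_half(x_rot) * sin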
neg_34 = x1_34 = None + mul_156 = cat_68 * sin_17 + cat_68 = None + add_103 = mul_155 + mul_156 + mul_155 = mul_156 = None + q_embed_17 = torch.cat([add_103, q_pass_17], dim=-1) + add_103 = q_pass_17 = None + mul_157 = k_rot_17 * cos_17 + cos_17 = None + x1_35 = k_rot_17[(Ellipsis, slice(None, 48, None))] + x2_35 = k_rot_17[(Ellipsis, slice(48, None, None))] + k_rot_17 = None + neg_35 = -x2_35 + x2_35 = None + cat_70 = torch.cat((neg_35, x1_35), dim=-1) + neg_35 = x1_35 = None + mul_158 = cat_70 * sin_17 + cat_70 = sin_17 = None + add_104 = mul_157 + mul_158 + mul_157 = mul_158 = None + k_embed_17 = torch.cat([add_104, k_pass_17], dim=-1) + add_104 = k_pass_17 = None + getitem_283 = k_embed_17[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_190 = getitem_283.expand(1, 8, 3, 2, 128) + getitem_283 = None + key_34 = hidden_states_190.reshape(1, 24, 2, 128) + hidden_states_190 = None + getitem_284 = value_states_35[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_191 = getitem_284.expand(1, 8, 3, 2, 128) + getitem_284 = None + value_34 = hidden_states_191.reshape(1, 24, 2, 128) + hidden_states_191 = None + attention_mask_17 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_17 = q_embed_17.contiguous() + q_embed_17 = None + key_35 = key_34.contiguous() + key_34 = None + value_35 = value_34.contiguous() + value_34 = None + attn_output_68 = torch._C._nn.scaled_dot_product_attention( + query_17, + key_35, + value_35, + attn_mask=attention_mask_17, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_17 = key_35 = value_35 = attention_mask_17 = None + transpose_71 = attn_output_68.transpose(1, 2) + attn_output_68 = None + attn_output_69 = transpose_71.contiguous() + transpose_71 = None + reshape_53 = attn_output_69.reshape(1, 2, -1) + attn_output_69 = None + attn_output_70 = reshape_53.contiguous() + reshape_53 = None + attn_output_71 = torch._C._nn.linear( + attn_output_70, + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_70 = l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_34 = torch.nn.functional.dropout(attn_output_71, 0.0, False, False) + attn_output_71 = None + hidden_states_192 = hidden_states_186 + dropout_34 + hidden_states_186 = dropout_34 = None + hidden_states_193 = hidden_states_192.to(torch.float32) + pow_36 = hidden_states_193.pow(2) + variance_35 = pow_36.mean(-1, keepdim=True) + pow_36 = None + add_106 = variance_35 + 1e-05 + variance_35 = None + rsqrt_35 = torch.rsqrt(add_106) + add_106 = None + hidden_states_194 = hidden_states_193 * rsqrt_35 + hidden_states_193 = rsqrt_35 = None + to_71 = hidden_states_194.to(torch.bfloat16) + hidden_states_194 = None + hidden_states_195 = ( + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + * to_71 + ) + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = ( + to_71 + ) = None + up_states_51 = torch._C._nn.linear( + hidden_states_195, + l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_195 = l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + 
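# [editor's note] The chunk / silu / multiply sequence that follows is a gated
# MLP: a single fused gate_up projection is split in two, the gate half goes
# through SiLU, and the gated product is projected back down. An equivalent
# hedged sketch (weight tensors are placeholders for the layer parameters):
import torch

def gated_mlp(x: torch.Tensor, gate_up_weight: torch.Tensor, down_weight: torch.Tensor) -> torch.Tensor:
    up_states = torch.nn.functional.linear(x, gate_up_weight)
    gate, up = up_states.chunk(2, dim=-1)
    return torch.nn.functional.linear(up * torch.nn.functional.silu(gate), down_weight)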
chunk_17 = up_states_51.chunk(2, dim=-1) + up_states_51 = None + gate_17 = chunk_17[0] + up_states_52 = chunk_17[1] + chunk_17 = None + silu_17 = torch.nn.functional.silu(gate_17, inplace=False) + gate_17 = None + up_states_53 = up_states_52 * silu_17 + up_states_52 = silu_17 = None + hidden_states_196 = torch._C._nn.linear( + up_states_53, + l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_53 = l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_35 = torch.nn.functional.dropout(hidden_states_196, 0.0, False, False) + hidden_states_196 = None + hidden_states_197 = hidden_states_192 + dropout_35 + hidden_states_192 = dropout_35 = None + hidden_states_198 = hidden_states_197.to(torch.float32) + pow_37 = hidden_states_198.pow(2) + variance_36 = pow_37.mean(-1, keepdim=True) + pow_37 = None + add_108 = variance_36 + 1e-05 + variance_36 = None + rsqrt_36 = torch.rsqrt(add_108) + add_108 = None + hidden_states_199 = hidden_states_198 * rsqrt_36 + hidden_states_198 = rsqrt_36 = None + to_73 = hidden_states_199.to(torch.bfloat16) + hidden_states_199 = None + hidden_states_200 = ( + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + * to_73 + ) + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + to_73 + ) = None + qkv_18 = torch._C._nn.linear( + hidden_states_200, + l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_200 = l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_36 = qkv_18[(Ellipsis, slice(None, 3072, None))] + key_states_36 = qkv_18[(Ellipsis, slice(3072, 4096, None))] + value_states_36 = qkv_18[(Ellipsis, slice(4096, None, None))] + qkv_18 = None + view_54 = query_states_36.view((1, 2, -1, 128)) + query_states_36 = None + query_states_37 = view_54.transpose(1, 2) + view_54 = None + view_55 = key_states_36.view((1, 2, -1, 128)) + key_states_36 = None + key_states_37 = view_55.transpose(1, 2) + view_55 = None + view_56 = value_states_36.view((1, 2, -1, 128)) + value_states_36 = None + value_states_37 = view_56.transpose(1, 2) + view_56 = None + cos_18 = l_stack0_0_.unsqueeze(1) + sin_18 = l_stack0_1_.unsqueeze(1) + q_rot_18 = query_states_37[(Ellipsis, slice(None, 96, None))] + q_pass_18 = query_states_37[(Ellipsis, slice(96, None, None))] + query_states_37 = None + k_rot_18 = key_states_37[(Ellipsis, slice(None, 96, None))] + k_pass_18 = key_states_37[(Ellipsis, slice(96, None, None))] + key_states_37 = None + mul_164 = q_rot_18 * cos_18 + x1_36 = q_rot_18[(Ellipsis, slice(None, 48, None))] + x2_36 = q_rot_18[(Ellipsis, slice(48, None, None))] + q_rot_18 = None + neg_36 = -x2_36 + x2_36 = None + cat_72 = torch.cat((neg_36, x1_36), dim=-1) + neg_36 = x1_36 = None + mul_165 = cat_72 * sin_18 + cat_72 = None + add_109 = mul_164 + mul_165 + mul_164 = mul_165 = None + q_embed_18 = torch.cat([add_109, q_pass_18], dim=-1) + add_109 = q_pass_18 = None + mul_166 = k_rot_18 * cos_18 + cos_18 = None + x1_37 = k_rot_18[(Ellipsis, slice(None, 48, None))] + x2_37 = k_rot_18[(Ellipsis, slice(48, None, None))] + k_rot_18 = None + neg_37 = -x2_37 + x2_37 = None + cat_74 = torch.cat((neg_37, x1_37), dim=-1) + neg_37 = x1_37 = None + mul_167 = cat_74 * sin_18 + cat_74 = sin_18 = None + add_110 = mul_166 + mul_167 + mul_166 = mul_167 = None + k_embed_18 = torch.cat([add_110, k_pass_18], dim=-1) + add_110 = k_pass_18 = 
None + getitem_299 = k_embed_18[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_201 = getitem_299.expand(1, 8, 3, 2, 128) + getitem_299 = None + key_36 = hidden_states_201.reshape(1, 24, 2, 128) + hidden_states_201 = None + getitem_300 = value_states_37[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_202 = getitem_300.expand(1, 8, 3, 2, 128) + getitem_300 = None + value_36 = hidden_states_202.reshape(1, 24, 2, 128) + hidden_states_202 = None + attention_mask_18 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_18 = q_embed_18.contiguous() + q_embed_18 = None + key_37 = key_36.contiguous() + key_36 = None + value_37 = value_36.contiguous() + value_36 = None + attn_output_72 = torch._C._nn.scaled_dot_product_attention( + query_18, + key_37, + value_37, + attn_mask=attention_mask_18, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_18 = key_37 = value_37 = attention_mask_18 = None + transpose_75 = attn_output_72.transpose(1, 2) + attn_output_72 = None + attn_output_73 = transpose_75.contiguous() + transpose_75 = None + reshape_56 = attn_output_73.reshape(1, 2, -1) + attn_output_73 = None + attn_output_74 = reshape_56.contiguous() + reshape_56 = None + attn_output_75 = torch._C._nn.linear( + attn_output_74, + l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_74 = l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_36 = torch.nn.functional.dropout(attn_output_75, 0.0, False, False) + attn_output_75 = None + hidden_states_203 = hidden_states_197 + dropout_36 + hidden_states_197 = dropout_36 = None + hidden_states_204 = hidden_states_203.to(torch.float32) + pow_38 = hidden_states_204.pow(2) + variance_37 = pow_38.mean(-1, keepdim=True) + pow_38 = None + add_112 = variance_37 + 1e-05 + variance_37 = None + rsqrt_37 = torch.rsqrt(add_112) + add_112 = None + hidden_states_205 = hidden_states_204 * rsqrt_37 + hidden_states_204 = rsqrt_37 = None + to_75 = hidden_states_205.to(torch.bfloat16) + hidden_states_205 = None + hidden_states_206 = ( + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + * to_75 + ) + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = ( + to_75 + ) = None + up_states_54 = torch._C._nn.linear( + hidden_states_206, + l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_206 = l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_18 = up_states_54.chunk(2, dim=-1) + up_states_54 = None + gate_18 = chunk_18[0] + up_states_55 = chunk_18[1] + chunk_18 = None + silu_18 = torch.nn.functional.silu(gate_18, inplace=False) + gate_18 = None + up_states_56 = up_states_55 * silu_18 + up_states_55 = silu_18 = None + hidden_states_207 = torch._C._nn.linear( + up_states_56, + l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_56 = l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_37 = torch.nn.functional.dropout(hidden_states_207, 0.0, False, False) + hidden_states_207 = None + 
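# [editor's note] Zooming out, every captured layer is the usual pre-norm
# decoder block: a residual around attention, then a residual around the MLP,
# with dropout compiled in as a no-op (p=0.0, training=False). Structural
# sketch, where the callables stand for the pieces annotated above:
def decoder_layer(x, input_norm, attn, post_attn_norm, mlp):
    x = x + attn(input_norm(x))
    x = x + mlp(post_attn_norm(x))   # e.g. hidden_states_208 = hidden_states_203 + dropout_37 just below
    return x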
hidden_states_208 = hidden_states_203 + dropout_37 + hidden_states_203 = dropout_37 = None + hidden_states_209 = hidden_states_208.to(torch.float32) + pow_39 = hidden_states_209.pow(2) + variance_38 = pow_39.mean(-1, keepdim=True) + pow_39 = None + add_114 = variance_38 + 1e-05 + variance_38 = None + rsqrt_38 = torch.rsqrt(add_114) + add_114 = None + hidden_states_210 = hidden_states_209 * rsqrt_38 + hidden_states_209 = rsqrt_38 = None + to_77 = hidden_states_210.to(torch.bfloat16) + hidden_states_210 = None + hidden_states_211 = ( + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + * to_77 + ) + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + to_77 + ) = None + qkv_19 = torch._C._nn.linear( + hidden_states_211, + l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_211 = l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_38 = qkv_19[(Ellipsis, slice(None, 3072, None))] + key_states_38 = qkv_19[(Ellipsis, slice(3072, 4096, None))] + value_states_38 = qkv_19[(Ellipsis, slice(4096, None, None))] + qkv_19 = None + view_57 = query_states_38.view((1, 2, -1, 128)) + query_states_38 = None + query_states_39 = view_57.transpose(1, 2) + view_57 = None + view_58 = key_states_38.view((1, 2, -1, 128)) + key_states_38 = None + key_states_39 = view_58.transpose(1, 2) + view_58 = None + view_59 = value_states_38.view((1, 2, -1, 128)) + value_states_38 = None + value_states_39 = view_59.transpose(1, 2) + view_59 = None + cos_19 = l_stack0_0_.unsqueeze(1) + sin_19 = l_stack0_1_.unsqueeze(1) + q_rot_19 = query_states_39[(Ellipsis, slice(None, 96, None))] + q_pass_19 = query_states_39[(Ellipsis, slice(96, None, None))] + query_states_39 = None + k_rot_19 = key_states_39[(Ellipsis, slice(None, 96, None))] + k_pass_19 = key_states_39[(Ellipsis, slice(96, None, None))] + key_states_39 = None + mul_173 = q_rot_19 * cos_19 + x1_38 = q_rot_19[(Ellipsis, slice(None, 48, None))] + x2_38 = q_rot_19[(Ellipsis, slice(48, None, None))] + q_rot_19 = None + neg_38 = -x2_38 + x2_38 = None + cat_76 = torch.cat((neg_38, x1_38), dim=-1) + neg_38 = x1_38 = None + mul_174 = cat_76 * sin_19 + cat_76 = None + add_115 = mul_173 + mul_174 + mul_173 = mul_174 = None + q_embed_19 = torch.cat([add_115, q_pass_19], dim=-1) + add_115 = q_pass_19 = None + mul_175 = k_rot_19 * cos_19 + cos_19 = None + x1_39 = k_rot_19[(Ellipsis, slice(None, 48, None))] + x2_39 = k_rot_19[(Ellipsis, slice(48, None, None))] + k_rot_19 = None + neg_39 = -x2_39 + x2_39 = None + cat_78 = torch.cat((neg_39, x1_39), dim=-1) + neg_39 = x1_39 = None + mul_176 = cat_78 * sin_19 + cat_78 = sin_19 = None + add_116 = mul_175 + mul_176 + mul_175 = mul_176 = None + k_embed_19 = torch.cat([add_116, k_pass_19], dim=-1) + add_116 = k_pass_19 = None + getitem_315 = k_embed_19[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_212 = getitem_315.expand(1, 8, 3, 2, 128) + getitem_315 = None + key_38 = hidden_states_212.reshape(1, 24, 2, 128) + hidden_states_212 = None + getitem_316 = value_states_39[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_213 = getitem_316.expand(1, 8, 3, 2, 128) + getitem_316 = None + value_38 = hidden_states_213.reshape(1, 24, 2, 128) + hidden_states_213 = None + attention_mask_19 
= l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_19 = q_embed_19.contiguous() + q_embed_19 = None + key_39 = key_38.contiguous() + key_38 = None + value_39 = value_38.contiguous() + value_38 = None + attn_output_76 = torch._C._nn.scaled_dot_product_attention( + query_19, + key_39, + value_39, + attn_mask=attention_mask_19, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_19 = key_39 = value_39 = attention_mask_19 = None + transpose_79 = attn_output_76.transpose(1, 2) + attn_output_76 = None + attn_output_77 = transpose_79.contiguous() + transpose_79 = None + reshape_59 = attn_output_77.reshape(1, 2, -1) + attn_output_77 = None + attn_output_78 = reshape_59.contiguous() + reshape_59 = None + attn_output_79 = torch._C._nn.linear( + attn_output_78, + l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_78 = l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_38 = torch.nn.functional.dropout(attn_output_79, 0.0, False, False) + attn_output_79 = None + hidden_states_214 = hidden_states_208 + dropout_38 + hidden_states_208 = dropout_38 = None + hidden_states_215 = hidden_states_214.to(torch.float32) + pow_40 = hidden_states_215.pow(2) + variance_39 = pow_40.mean(-1, keepdim=True) + pow_40 = None + add_118 = variance_39 + 1e-05 + variance_39 = None + rsqrt_39 = torch.rsqrt(add_118) + add_118 = None + hidden_states_216 = hidden_states_215 * rsqrt_39 + hidden_states_215 = rsqrt_39 = None + to_79 = hidden_states_216.to(torch.bfloat16) + hidden_states_216 = None + hidden_states_217 = ( + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + * to_79 + ) + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = ( + to_79 + ) = None + up_states_57 = torch._C._nn.linear( + hidden_states_217, + l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_217 = l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_19 = up_states_57.chunk(2, dim=-1) + up_states_57 = None + gate_19 = chunk_19[0] + up_states_58 = chunk_19[1] + chunk_19 = None + silu_19 = torch.nn.functional.silu(gate_19, inplace=False) + gate_19 = None + up_states_59 = up_states_58 * silu_19 + up_states_58 = silu_19 = None + hidden_states_218 = torch._C._nn.linear( + up_states_59, + l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_59 = l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_39 = torch.nn.functional.dropout(hidden_states_218, 0.0, False, False) + hidden_states_218 = None + hidden_states_219 = hidden_states_214 + dropout_39 + hidden_states_214 = dropout_39 = None + hidden_states_220 = hidden_states_219.to(torch.float32) + pow_41 = hidden_states_220.pow(2) + variance_40 = pow_41.mean(-1, keepdim=True) + pow_41 = None + add_120 = variance_40 + 1e-05 + variance_40 = None + rsqrt_40 = torch.rsqrt(add_120) + add_120 = None + hidden_states_221 = hidden_states_220 * rsqrt_40 + hidden_states_220 = rsqrt_40 = None + to_81 = hidden_states_221.to(torch.bfloat16) + hidden_states_221 = None + hidden_states_222 = ( + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + * to_81 + ) + 
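# [editor's note] The pervasive `name = None` (and `lhs = (rhs) = None`)
# assignments are not part of the model's math: torch.compile's generated
# forward drops each intermediate and parameter reference as soon as it was
# last used, so the underlying tensors become collectable early and peak
# memory stays close to that of an eager run.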
l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + to_81 + ) = None + qkv_20 = torch._C._nn.linear( + hidden_states_222, + l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_222 = l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_40 = qkv_20[(Ellipsis, slice(None, 3072, None))] + key_states_40 = qkv_20[(Ellipsis, slice(3072, 4096, None))] + value_states_40 = qkv_20[(Ellipsis, slice(4096, None, None))] + qkv_20 = None + view_60 = query_states_40.view((1, 2, -1, 128)) + query_states_40 = None + query_states_41 = view_60.transpose(1, 2) + view_60 = None + view_61 = key_states_40.view((1, 2, -1, 128)) + key_states_40 = None + key_states_41 = view_61.transpose(1, 2) + view_61 = None + view_62 = value_states_40.view((1, 2, -1, 128)) + value_states_40 = None + value_states_41 = view_62.transpose(1, 2) + view_62 = None + cos_20 = l_stack0_0_.unsqueeze(1) + sin_20 = l_stack0_1_.unsqueeze(1) + q_rot_20 = query_states_41[(Ellipsis, slice(None, 96, None))] + q_pass_20 = query_states_41[(Ellipsis, slice(96, None, None))] + query_states_41 = None + k_rot_20 = key_states_41[(Ellipsis, slice(None, 96, None))] + k_pass_20 = key_states_41[(Ellipsis, slice(96, None, None))] + key_states_41 = None + mul_182 = q_rot_20 * cos_20 + x1_40 = q_rot_20[(Ellipsis, slice(None, 48, None))] + x2_40 = q_rot_20[(Ellipsis, slice(48, None, None))] + q_rot_20 = None + neg_40 = -x2_40 + x2_40 = None + cat_80 = torch.cat((neg_40, x1_40), dim=-1) + neg_40 = x1_40 = None + mul_183 = cat_80 * sin_20 + cat_80 = None + add_121 = mul_182 + mul_183 + mul_182 = mul_183 = None + q_embed_20 = torch.cat([add_121, q_pass_20], dim=-1) + add_121 = q_pass_20 = None + mul_184 = k_rot_20 * cos_20 + cos_20 = None + x1_41 = k_rot_20[(Ellipsis, slice(None, 48, None))] + x2_41 = k_rot_20[(Ellipsis, slice(48, None, None))] + k_rot_20 = None + neg_41 = -x2_41 + x2_41 = None + cat_82 = torch.cat((neg_41, x1_41), dim=-1) + neg_41 = x1_41 = None + mul_185 = cat_82 * sin_20 + cat_82 = sin_20 = None + add_122 = mul_184 + mul_185 + mul_184 = mul_185 = None + k_embed_20 = torch.cat([add_122, k_pass_20], dim=-1) + add_122 = k_pass_20 = None + getitem_331 = k_embed_20[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_223 = getitem_331.expand(1, 8, 3, 2, 128) + getitem_331 = None + key_40 = hidden_states_223.reshape(1, 24, 2, 128) + hidden_states_223 = None + getitem_332 = value_states_41[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_224 = getitem_332.expand(1, 8, 3, 2, 128) + getitem_332 = None + value_40 = hidden_states_224.reshape(1, 24, 2, 128) + hidden_states_224 = None + attention_mask_20 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_20 = q_embed_20.contiguous() + q_embed_20 = None + key_41 = key_40.contiguous() + key_40 = None + value_41 = value_40.contiguous() + value_40 = None + attn_output_80 = torch._C._nn.scaled_dot_product_attention( + query_20, + key_41, + value_41, + attn_mask=attention_mask_20, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_20 = key_41 = value_41 = attention_mask_20 = None + transpose_83 = attn_output_80.transpose(1, 2) + attn_output_80 = None + 
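# [editor's note] The index-with-None / expand / reshape triple applied to the
# key and value tensors in each layer (expand(1, 8, 3, 2, 128) followed by
# reshape(1, 24, 2, 128)) replicates each of the 8 KV heads 3 times to match
# the 24 query heads: grouped-query attention's repeat_kv. A hedged equivalent:
import torch

def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (batch, num_kv_heads, seq, head_dim) -> (batch, num_kv_heads * n_rep, seq, head_dim)
    b, h, s, d = kv.shape
    return kv[:, :, None, :, :].expand(b, h, n_rep, s, d).reshape(b, h * n_rep, s, d)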
attn_output_81 = transpose_83.contiguous() + transpose_83 = None + reshape_62 = attn_output_81.reshape(1, 2, -1) + attn_output_81 = None + attn_output_82 = reshape_62.contiguous() + reshape_62 = None + attn_output_83 = torch._C._nn.linear( + attn_output_82, + l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_82 = l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_40 = torch.nn.functional.dropout(attn_output_83, 0.0, False, False) + attn_output_83 = None + hidden_states_225 = hidden_states_219 + dropout_40 + hidden_states_219 = dropout_40 = None + hidden_states_226 = hidden_states_225.to(torch.float32) + pow_42 = hidden_states_226.pow(2) + variance_41 = pow_42.mean(-1, keepdim=True) + pow_42 = None + add_124 = variance_41 + 1e-05 + variance_41 = None + rsqrt_41 = torch.rsqrt(add_124) + add_124 = None + hidden_states_227 = hidden_states_226 * rsqrt_41 + hidden_states_226 = rsqrt_41 = None + to_83 = hidden_states_227.to(torch.bfloat16) + hidden_states_227 = None + hidden_states_228 = ( + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + * to_83 + ) + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = ( + to_83 + ) = None + up_states_60 = torch._C._nn.linear( + hidden_states_228, + l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_228 = l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_20 = up_states_60.chunk(2, dim=-1) + up_states_60 = None + gate_20 = chunk_20[0] + up_states_61 = chunk_20[1] + chunk_20 = None + silu_20 = torch.nn.functional.silu(gate_20, inplace=False) + gate_20 = None + up_states_62 = up_states_61 * silu_20 + up_states_61 = silu_20 = None + hidden_states_229 = torch._C._nn.linear( + up_states_62, + l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_62 = l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_41 = torch.nn.functional.dropout(hidden_states_229, 0.0, False, False) + hidden_states_229 = None + hidden_states_230 = hidden_states_225 + dropout_41 + hidden_states_225 = dropout_41 = None + hidden_states_231 = hidden_states_230.to(torch.float32) + pow_43 = hidden_states_231.pow(2) + variance_42 = pow_43.mean(-1, keepdim=True) + pow_43 = None + add_126 = variance_42 + 1e-05 + variance_42 = None + rsqrt_42 = torch.rsqrt(add_126) + add_126 = None + hidden_states_232 = hidden_states_231 * rsqrt_42 + hidden_states_231 = rsqrt_42 = None + to_85 = hidden_states_232.to(torch.bfloat16) + hidden_states_232 = None + hidden_states_233 = ( + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + * to_85 + ) + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + to_85 + ) = None + qkv_21 = torch._C._nn.linear( + hidden_states_233, + l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_233 = l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_42 = qkv_21[(Ellipsis, slice(None, 3072, None))] + key_states_42 = qkv_21[(Ellipsis, slice(3072, 4096, None))] + value_states_42 = qkv_21[(Ellipsis, slice(4096, None, None))] + qkv_21 = None + view_63 = query_states_42.view((1, 2, -1, 128)) + query_states_42 = None + 
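# [editor's note] Every scaled_dot_product_attention call in this graph passes
# scale=0.08838834764831845, which is simply head_dim ** -0.5 for head_dim=128
# baked in as a constant, and is_causal=False because causality comes from the
# explicit attn_mask instead. Quick check:
import math

head_dim = 128
assert math.isclose(head_dim ** -0.5, 0.08838834764831845)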
query_states_43 = view_63.transpose(1, 2) + view_63 = None + view_64 = key_states_42.view((1, 2, -1, 128)) + key_states_42 = None + key_states_43 = view_64.transpose(1, 2) + view_64 = None + view_65 = value_states_42.view((1, 2, -1, 128)) + value_states_42 = None + value_states_43 = view_65.transpose(1, 2) + view_65 = None + cos_21 = l_stack0_0_.unsqueeze(1) + sin_21 = l_stack0_1_.unsqueeze(1) + q_rot_21 = query_states_43[(Ellipsis, slice(None, 96, None))] + q_pass_21 = query_states_43[(Ellipsis, slice(96, None, None))] + query_states_43 = None + k_rot_21 = key_states_43[(Ellipsis, slice(None, 96, None))] + k_pass_21 = key_states_43[(Ellipsis, slice(96, None, None))] + key_states_43 = None + mul_191 = q_rot_21 * cos_21 + x1_42 = q_rot_21[(Ellipsis, slice(None, 48, None))] + x2_42 = q_rot_21[(Ellipsis, slice(48, None, None))] + q_rot_21 = None + neg_42 = -x2_42 + x2_42 = None + cat_84 = torch.cat((neg_42, x1_42), dim=-1) + neg_42 = x1_42 = None + mul_192 = cat_84 * sin_21 + cat_84 = None + add_127 = mul_191 + mul_192 + mul_191 = mul_192 = None + q_embed_21 = torch.cat([add_127, q_pass_21], dim=-1) + add_127 = q_pass_21 = None + mul_193 = k_rot_21 * cos_21 + cos_21 = None + x1_43 = k_rot_21[(Ellipsis, slice(None, 48, None))] + x2_43 = k_rot_21[(Ellipsis, slice(48, None, None))] + k_rot_21 = None + neg_43 = -x2_43 + x2_43 = None + cat_86 = torch.cat((neg_43, x1_43), dim=-1) + neg_43 = x1_43 = None + mul_194 = cat_86 * sin_21 + cat_86 = sin_21 = None + add_128 = mul_193 + mul_194 + mul_193 = mul_194 = None + k_embed_21 = torch.cat([add_128, k_pass_21], dim=-1) + add_128 = k_pass_21 = None + getitem_347 = k_embed_21[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_234 = getitem_347.expand(1, 8, 3, 2, 128) + getitem_347 = None + key_42 = hidden_states_234.reshape(1, 24, 2, 128) + hidden_states_234 = None + getitem_348 = value_states_43[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_235 = getitem_348.expand(1, 8, 3, 2, 128) + getitem_348 = None + value_42 = hidden_states_235.reshape(1, 24, 2, 128) + hidden_states_235 = None + attention_mask_21 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_21 = q_embed_21.contiguous() + q_embed_21 = None + key_43 = key_42.contiguous() + key_42 = None + value_43 = value_42.contiguous() + value_42 = None + attn_output_84 = torch._C._nn.scaled_dot_product_attention( + query_21, + key_43, + value_43, + attn_mask=attention_mask_21, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_21 = key_43 = value_43 = attention_mask_21 = None + transpose_87 = attn_output_84.transpose(1, 2) + attn_output_84 = None + attn_output_85 = transpose_87.contiguous() + transpose_87 = None + reshape_65 = attn_output_85.reshape(1, 2, -1) + attn_output_85 = None + attn_output_86 = reshape_65.contiguous() + reshape_65 = None + attn_output_87 = torch._C._nn.linear( + attn_output_86, + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_86 = l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_42 = torch.nn.functional.dropout(attn_output_87, 0.0, False, False) + attn_output_87 = None + hidden_states_236 = hidden_states_230 + dropout_42 + hidden_states_230 = dropout_42 = None + 
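# [editor's note] attention_mask_21 above, like its siblings, is the
# precomputed additive causal mask cropped to the current key length:
# l_causal_mask_[:, :, :, :2] for the sequence length of 2 captured in this
# trace. An illustrative construction of such a mask (shapes assumed, not
# taken from the patch):
import torch

seq_len = 2
causal_mask = torch.full((1, 1, seq_len, seq_len), float("-inf")).triu(1)
# zeros on and below the diagonal let a position attend to itself and the past;
# -inf strictly above the diagonal blocks attention to future positions
attn_mask = causal_mask[:, :, :, :seq_len]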
hidden_states_237 = hidden_states_236.to(torch.float32) + pow_44 = hidden_states_237.pow(2) + variance_43 = pow_44.mean(-1, keepdim=True) + pow_44 = None + add_130 = variance_43 + 1e-05 + variance_43 = None + rsqrt_43 = torch.rsqrt(add_130) + add_130 = None + hidden_states_238 = hidden_states_237 * rsqrt_43 + hidden_states_237 = rsqrt_43 = None + to_87 = hidden_states_238.to(torch.bfloat16) + hidden_states_238 = None + hidden_states_239 = ( + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + * to_87 + ) + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = ( + to_87 + ) = None + up_states_63 = torch._C._nn.linear( + hidden_states_239, + l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_239 = l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_21 = up_states_63.chunk(2, dim=-1) + up_states_63 = None + gate_21 = chunk_21[0] + up_states_64 = chunk_21[1] + chunk_21 = None + silu_21 = torch.nn.functional.silu(gate_21, inplace=False) + gate_21 = None + up_states_65 = up_states_64 * silu_21 + up_states_64 = silu_21 = None + hidden_states_240 = torch._C._nn.linear( + up_states_65, + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_65 = l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_43 = torch.nn.functional.dropout(hidden_states_240, 0.0, False, False) + hidden_states_240 = None + hidden_states_241 = hidden_states_236 + dropout_43 + hidden_states_236 = dropout_43 = None + hidden_states_242 = hidden_states_241.to(torch.float32) + pow_45 = hidden_states_242.pow(2) + variance_44 = pow_45.mean(-1, keepdim=True) + pow_45 = None + add_132 = variance_44 + 1e-05 + variance_44 = None + rsqrt_44 = torch.rsqrt(add_132) + add_132 = None + hidden_states_243 = hidden_states_242 * rsqrt_44 + hidden_states_242 = rsqrt_44 = None + to_89 = hidden_states_243.to(torch.bfloat16) + hidden_states_243 = None + hidden_states_244 = ( + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + * to_89 + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + to_89 + ) = None + qkv_22 = torch._C._nn.linear( + hidden_states_244, + l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_244 = l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_44 = qkv_22[(Ellipsis, slice(None, 3072, None))] + key_states_44 = qkv_22[(Ellipsis, slice(3072, 4096, None))] + value_states_44 = qkv_22[(Ellipsis, slice(4096, None, None))] + qkv_22 = None + view_66 = query_states_44.view((1, 2, -1, 128)) + query_states_44 = None + query_states_45 = view_66.transpose(1, 2) + view_66 = None + view_67 = key_states_44.view((1, 2, -1, 128)) + key_states_44 = None + key_states_45 = view_67.transpose(1, 2) + view_67 = None + view_68 = value_states_44.view((1, 2, -1, 128)) + value_states_44 = None + value_states_45 = view_68.transpose(1, 2) + view_68 = None + cos_22 = l_stack0_0_.unsqueeze(1) + sin_22 = l_stack0_1_.unsqueeze(1) + q_rot_22 = query_states_45[(Ellipsis, slice(None, 96, None))] + q_pass_22 = query_states_45[(Ellipsis, slice(96, None, None))] + query_states_45 = None + k_rot_22 = key_states_45[(Ellipsis, slice(None, 96, None))] + k_pass_22 = key_states_45[(Ellipsis, slice(96, None, None))] 
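# [editor's note] Only the first 96 of the 128 head dims are rotated (the
# q_rot/q_pass and k_rot/k_pass splits at index 96 above), i.e. a partial
# rotary factor of 96/128 = 0.75; the untouched "pass" dims are concatenated
# back after RoPE. Sketch combining the split with rotate_half from the
# earlier note:
import torch

def apply_partial_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, rotary_dim: int = 96) -> torch.Tensor:
    x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
    half = rotary_dim // 2  # 48, matching the x1/x2 slices in the graph
    rotated = torch.cat((-x_rot[..., half:], x_rot[..., :half]), dim=-1)
    return torch.cat([x_rot * cos + rotated * sin, x_pass], dim=-1)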
+ key_states_45 = None + mul_200 = q_rot_22 * cos_22 + x1_44 = q_rot_22[(Ellipsis, slice(None, 48, None))] + x2_44 = q_rot_22[(Ellipsis, slice(48, None, None))] + q_rot_22 = None + neg_44 = -x2_44 + x2_44 = None + cat_88 = torch.cat((neg_44, x1_44), dim=-1) + neg_44 = x1_44 = None + mul_201 = cat_88 * sin_22 + cat_88 = None + add_133 = mul_200 + mul_201 + mul_200 = mul_201 = None + q_embed_22 = torch.cat([add_133, q_pass_22], dim=-1) + add_133 = q_pass_22 = None + mul_202 = k_rot_22 * cos_22 + cos_22 = None + x1_45 = k_rot_22[(Ellipsis, slice(None, 48, None))] + x2_45 = k_rot_22[(Ellipsis, slice(48, None, None))] + k_rot_22 = None + neg_45 = -x2_45 + x2_45 = None + cat_90 = torch.cat((neg_45, x1_45), dim=-1) + neg_45 = x1_45 = None + mul_203 = cat_90 * sin_22 + cat_90 = sin_22 = None + add_134 = mul_202 + mul_203 + mul_202 = mul_203 = None + k_embed_22 = torch.cat([add_134, k_pass_22], dim=-1) + add_134 = k_pass_22 = None + getitem_363 = k_embed_22[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_245 = getitem_363.expand(1, 8, 3, 2, 128) + getitem_363 = None + key_44 = hidden_states_245.reshape(1, 24, 2, 128) + hidden_states_245 = None + getitem_364 = value_states_45[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_246 = getitem_364.expand(1, 8, 3, 2, 128) + getitem_364 = None + value_44 = hidden_states_246.reshape(1, 24, 2, 128) + hidden_states_246 = None + attention_mask_22 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_22 = q_embed_22.contiguous() + q_embed_22 = None + key_45 = key_44.contiguous() + key_44 = None + value_45 = value_44.contiguous() + value_44 = None + attn_output_88 = torch._C._nn.scaled_dot_product_attention( + query_22, + key_45, + value_45, + attn_mask=attention_mask_22, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_22 = key_45 = value_45 = attention_mask_22 = None + transpose_91 = attn_output_88.transpose(1, 2) + attn_output_88 = None + attn_output_89 = transpose_91.contiguous() + transpose_91 = None + reshape_68 = attn_output_89.reshape(1, 2, -1) + attn_output_89 = None + attn_output_90 = reshape_68.contiguous() + reshape_68 = None + attn_output_91 = torch._C._nn.linear( + attn_output_90, + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_90 = l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_44 = torch.nn.functional.dropout(attn_output_91, 0.0, False, False) + attn_output_91 = None + hidden_states_247 = hidden_states_241 + dropout_44 + hidden_states_241 = dropout_44 = None + hidden_states_248 = hidden_states_247.to(torch.float32) + pow_46 = hidden_states_248.pow(2) + variance_45 = pow_46.mean(-1, keepdim=True) + pow_46 = None + add_136 = variance_45 + 1e-05 + variance_45 = None + rsqrt_45 = torch.rsqrt(add_136) + add_136 = None + hidden_states_249 = hidden_states_248 * rsqrt_45 + hidden_states_248 = rsqrt_45 = None + to_91 = hidden_states_249.to(torch.bfloat16) + hidden_states_249 = None + hidden_states_250 = ( + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + * to_91 + ) + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = ( + to_91 + ) = None + up_states_66 = 
torch._C._nn.linear( + hidden_states_250, + l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_250 = l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_22 = up_states_66.chunk(2, dim=-1) + up_states_66 = None + gate_22 = chunk_22[0] + up_states_67 = chunk_22[1] + chunk_22 = None + silu_22 = torch.nn.functional.silu(gate_22, inplace=False) + gate_22 = None + up_states_68 = up_states_67 * silu_22 + up_states_67 = silu_22 = None + hidden_states_251 = torch._C._nn.linear( + up_states_68, + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_68 = l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_45 = torch.nn.functional.dropout(hidden_states_251, 0.0, False, False) + hidden_states_251 = None + hidden_states_252 = hidden_states_247 + dropout_45 + hidden_states_247 = dropout_45 = None + hidden_states_253 = hidden_states_252.to(torch.float32) + pow_47 = hidden_states_253.pow(2) + variance_46 = pow_47.mean(-1, keepdim=True) + pow_47 = None + add_138 = variance_46 + 1e-05 + variance_46 = None + rsqrt_46 = torch.rsqrt(add_138) + add_138 = None + hidden_states_254 = hidden_states_253 * rsqrt_46 + hidden_states_253 = rsqrt_46 = None + to_93 = hidden_states_254.to(torch.bfloat16) + hidden_states_254 = None + hidden_states_255 = ( + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + * to_93 + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + to_93 + ) = None + qkv_23 = torch._C._nn.linear( + hidden_states_255, + l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_255 = l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_46 = qkv_23[(Ellipsis, slice(None, 3072, None))] + key_states_46 = qkv_23[(Ellipsis, slice(3072, 4096, None))] + value_states_46 = qkv_23[(Ellipsis, slice(4096, None, None))] + qkv_23 = None + view_69 = query_states_46.view((1, 2, -1, 128)) + query_states_46 = None + query_states_47 = view_69.transpose(1, 2) + view_69 = None + view_70 = key_states_46.view((1, 2, -1, 128)) + key_states_46 = None + key_states_47 = view_70.transpose(1, 2) + view_70 = None + view_71 = value_states_46.view((1, 2, -1, 128)) + value_states_46 = None + value_states_47 = view_71.transpose(1, 2) + view_71 = None + cos_23 = l_stack0_0_.unsqueeze(1) + sin_23 = l_stack0_1_.unsqueeze(1) + q_rot_23 = query_states_47[(Ellipsis, slice(None, 96, None))] + q_pass_23 = query_states_47[(Ellipsis, slice(96, None, None))] + query_states_47 = None + k_rot_23 = key_states_47[(Ellipsis, slice(None, 96, None))] + k_pass_23 = key_states_47[(Ellipsis, slice(96, None, None))] + key_states_47 = None + mul_209 = q_rot_23 * cos_23 + x1_46 = q_rot_23[(Ellipsis, slice(None, 48, None))] + x2_46 = q_rot_23[(Ellipsis, slice(48, None, None))] + q_rot_23 = None + neg_46 = -x2_46 + x2_46 = None + cat_92 = torch.cat((neg_46, x1_46), dim=-1) + neg_46 = x1_46 = None + mul_210 = cat_92 * sin_23 + cat_92 = None + add_139 = mul_209 + mul_210 + mul_209 = mul_210 = None + q_embed_23 = torch.cat([add_139, q_pass_23], dim=-1) + add_139 = q_pass_23 = None + mul_211 = k_rot_23 * cos_23 + cos_23 = None + x1_47 = k_rot_23[(Ellipsis, slice(None, 48, None))] + x2_47 = k_rot_23[(Ellipsis, slice(48, None, None))] + k_rot_23 = None + neg_47 = -x2_47 + x2_47 = None + 
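The slice/negate/concatenate pattern repeated for every q_rot_*/k_rot_* pair (x1 = x[..., :48], x2 = x[..., 48:], cat((-x2, x1))) is the standard rotate-half form of rotary position embeddings, applied here only to the first 96 of the 128 head dimensions (a partial rotary factor of 0.75); the trailing 32 dimensions (q_pass_*/k_pass_*) pass through unrotated. A hedged sketch of the same computation, with l_stack0_0_/l_stack0_1_ playing the roles of cos and sin (helper names are illustrative):

    import torch

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)          # split at 48 of 96, as in the trace
        return torch.cat((-x2, x1), dim=-1)

    def apply_partial_rope(q, k, cos, sin, rot_dim=96):
        cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)   # broadcast over heads
        q_rot, q_pass = q[..., :rot_dim], q[..., rot_dim:]
        k_rot, k_pass = k[..., :rot_dim], k[..., rot_dim:]
        q_out = torch.cat([q_rot * cos + rotate_half(q_rot) * sin, q_pass], dim=-1)
        k_out = torch.cat([k_rot * cos + rotate_half(k_rot) * sin, k_pass], dim=-1)
        return q_out, k_out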
cat_94 = torch.cat((neg_47, x1_47), dim=-1) + neg_47 = x1_47 = None + mul_212 = cat_94 * sin_23 + cat_94 = sin_23 = None + add_140 = mul_211 + mul_212 + mul_211 = mul_212 = None + k_embed_23 = torch.cat([add_140, k_pass_23], dim=-1) + add_140 = k_pass_23 = None + getitem_379 = k_embed_23[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_256 = getitem_379.expand(1, 8, 3, 2, 128) + getitem_379 = None + key_46 = hidden_states_256.reshape(1, 24, 2, 128) + hidden_states_256 = None + getitem_380 = value_states_47[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_257 = getitem_380.expand(1, 8, 3, 2, 128) + getitem_380 = None + value_46 = hidden_states_257.reshape(1, 24, 2, 128) + hidden_states_257 = None + attention_mask_23 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_23 = q_embed_23.contiguous() + q_embed_23 = None + key_47 = key_46.contiguous() + key_46 = None + value_47 = value_46.contiguous() + value_46 = None + attn_output_92 = torch._C._nn.scaled_dot_product_attention( + query_23, + key_47, + value_47, + attn_mask=attention_mask_23, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_23 = key_47 = value_47 = attention_mask_23 = None + transpose_95 = attn_output_92.transpose(1, 2) + attn_output_92 = None + attn_output_93 = transpose_95.contiguous() + transpose_95 = None + reshape_71 = attn_output_93.reshape(1, 2, -1) + attn_output_93 = None + attn_output_94 = reshape_71.contiguous() + reshape_71 = None + attn_output_95 = torch._C._nn.linear( + attn_output_94, + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_94 = l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_46 = torch.nn.functional.dropout(attn_output_95, 0.0, False, False) + attn_output_95 = None + hidden_states_258 = hidden_states_252 + dropout_46 + hidden_states_252 = dropout_46 = None + hidden_states_259 = hidden_states_258.to(torch.float32) + pow_48 = hidden_states_259.pow(2) + variance_47 = pow_48.mean(-1, keepdim=True) + pow_48 = None + add_142 = variance_47 + 1e-05 + variance_47 = None + rsqrt_47 = torch.rsqrt(add_142) + add_142 = None + hidden_states_260 = hidden_states_259 * rsqrt_47 + hidden_states_259 = rsqrt_47 = None + to_95 = hidden_states_260.to(torch.bfloat16) + hidden_states_260 = None + hidden_states_261 = ( + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + * to_95 + ) + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = ( + to_95 + ) = None + up_states_69 = torch._C._nn.linear( + hidden_states_261, + l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_261 = l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_23 = up_states_69.chunk(2, dim=-1) + up_states_69 = None + gate_23 = chunk_23[0] + up_states_70 = chunk_23[1] + chunk_23 = None + silu_23 = torch.nn.functional.silu(gate_23, inplace=False) + gate_23 = None + up_states_71 = up_states_70 * silu_23 + up_states_70 = silu_23 = None + hidden_states_262 = torch._C._nn.linear( + up_states_71, + 
l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_71 = l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_47 = torch.nn.functional.dropout(hidden_states_262, 0.0, False, False) + hidden_states_262 = None + hidden_states_263 = hidden_states_258 + dropout_47 + hidden_states_258 = dropout_47 = None + hidden_states_264 = hidden_states_263.to(torch.float32) + pow_49 = hidden_states_264.pow(2) + variance_48 = pow_49.mean(-1, keepdim=True) + pow_49 = None + add_144 = variance_48 + 1e-05 + variance_48 = None + rsqrt_48 = torch.rsqrt(add_144) + add_144 = None + hidden_states_265 = hidden_states_264 * rsqrt_48 + hidden_states_264 = rsqrt_48 = None + to_97 = hidden_states_265.to(torch.bfloat16) + hidden_states_265 = None + hidden_states_266 = ( + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ + * to_97 + ) + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = ( + to_97 + ) = None + qkv_24 = torch._C._nn.linear( + hidden_states_266, + l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_266 = l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_48 = qkv_24[(Ellipsis, slice(None, 3072, None))] + key_states_48 = qkv_24[(Ellipsis, slice(3072, 4096, None))] + value_states_48 = qkv_24[(Ellipsis, slice(4096, None, None))] + qkv_24 = None + view_72 = query_states_48.view((1, 2, -1, 128)) + query_states_48 = None + query_states_49 = view_72.transpose(1, 2) + view_72 = None + view_73 = key_states_48.view((1, 2, -1, 128)) + key_states_48 = None + key_states_49 = view_73.transpose(1, 2) + view_73 = None + view_74 = value_states_48.view((1, 2, -1, 128)) + value_states_48 = None + value_states_49 = view_74.transpose(1, 2) + view_74 = None + cos_24 = l_stack0_0_.unsqueeze(1) + sin_24 = l_stack0_1_.unsqueeze(1) + q_rot_24 = query_states_49[(Ellipsis, slice(None, 96, None))] + q_pass_24 = query_states_49[(Ellipsis, slice(96, None, None))] + query_states_49 = None + k_rot_24 = key_states_49[(Ellipsis, slice(None, 96, None))] + k_pass_24 = key_states_49[(Ellipsis, slice(96, None, None))] + key_states_49 = None + mul_218 = q_rot_24 * cos_24 + x1_48 = q_rot_24[(Ellipsis, slice(None, 48, None))] + x2_48 = q_rot_24[(Ellipsis, slice(48, None, None))] + q_rot_24 = None + neg_48 = -x2_48 + x2_48 = None + cat_96 = torch.cat((neg_48, x1_48), dim=-1) + neg_48 = x1_48 = None + mul_219 = cat_96 * sin_24 + cat_96 = None + add_145 = mul_218 + mul_219 + mul_218 = mul_219 = None + q_embed_24 = torch.cat([add_145, q_pass_24], dim=-1) + add_145 = q_pass_24 = None + mul_220 = k_rot_24 * cos_24 + cos_24 = None + x1_49 = k_rot_24[(Ellipsis, slice(None, 48, None))] + x2_49 = k_rot_24[(Ellipsis, slice(48, None, None))] + k_rot_24 = None + neg_49 = -x2_49 + x2_49 = None + cat_98 = torch.cat((neg_49, x1_49), dim=-1) + neg_49 = x1_49 = None + mul_221 = cat_98 * sin_24 + cat_98 = sin_24 = None + add_146 = mul_220 + mul_221 + mul_220 = mul_221 = None + k_embed_24 = torch.cat([add_146, k_pass_24], dim=-1) + add_146 = k_pass_24 = None + getitem_395 = k_embed_24[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_267 = getitem_395.expand(1, 8, 3, 2, 128) + getitem_395 = None + key_48 = hidden_states_267.reshape(1, 24, 2, 128) + hidden_states_267 = None + getitem_396 = 
value_states_49[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_268 = getitem_396.expand(1, 8, 3, 2, 128) + getitem_396 = None + value_48 = hidden_states_268.reshape(1, 24, 2, 128) + hidden_states_268 = None + attention_mask_24 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_24 = q_embed_24.contiguous() + q_embed_24 = None + key_49 = key_48.contiguous() + key_48 = None + value_49 = value_48.contiguous() + value_48 = None + attn_output_96 = torch._C._nn.scaled_dot_product_attention( + query_24, + key_49, + value_49, + attn_mask=attention_mask_24, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_24 = key_49 = value_49 = attention_mask_24 = None + transpose_99 = attn_output_96.transpose(1, 2) + attn_output_96 = None + attn_output_97 = transpose_99.contiguous() + transpose_99 = None + reshape_74 = attn_output_97.reshape(1, 2, -1) + attn_output_97 = None + attn_output_98 = reshape_74.contiguous() + reshape_74 = None + attn_output_99 = torch._C._nn.linear( + attn_output_98, + l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_98 = l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_48 = torch.nn.functional.dropout(attn_output_99, 0.0, False, False) + attn_output_99 = None + hidden_states_269 = hidden_states_263 + dropout_48 + hidden_states_263 = dropout_48 = None + hidden_states_270 = hidden_states_269.to(torch.float32) + pow_50 = hidden_states_270.pow(2) + variance_49 = pow_50.mean(-1, keepdim=True) + pow_50 = None + add_148 = variance_49 + 1e-05 + variance_49 = None + rsqrt_49 = torch.rsqrt(add_148) + add_148 = None + hidden_states_271 = hidden_states_270 * rsqrt_49 + hidden_states_270 = rsqrt_49 = None + to_99 = hidden_states_271.to(torch.bfloat16) + hidden_states_271 = None + hidden_states_272 = ( + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ + * to_99 + ) + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = ( + to_99 + ) = None + up_states_72 = torch._C._nn.linear( + hidden_states_272, + l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_272 = l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_24 = up_states_72.chunk(2, dim=-1) + up_states_72 = None + gate_24 = chunk_24[0] + up_states_73 = chunk_24[1] + chunk_24 = None + silu_24 = torch.nn.functional.silu(gate_24, inplace=False) + gate_24 = None + up_states_74 = up_states_73 * silu_24 + up_states_73 = silu_24 = None + hidden_states_273 = torch._C._nn.linear( + up_states_74, + l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_74 = l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_49 = torch.nn.functional.dropout(hidden_states_273, 0.0, False, False) + hidden_states_273 = None + hidden_states_274 = hidden_states_269 + dropout_49 + hidden_states_269 = dropout_49 = None + hidden_states_275 = hidden_states_274.to(torch.float32) + pow_51 = hidden_states_275.pow(2) + variance_50 = pow_51.mean(-1, keepdim=True) + pow_51 = None + add_150 = variance_50 + 1e-05 + variance_50 = None + rsqrt_50 = torch.rsqrt(add_150) + add_150 = None 
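The pow/mean/rsqrt sequence that brackets every attention and MLP block above is RMSNorm computed in float32 and cast back to bfloat16 before the learned weight is applied, with eps = 1e-05 read off the add_* constants. A compact equivalent (the function name is illustrative):

    import torch

    def rms_norm(x, weight, eps=1e-05):
        h = x.to(torch.float32)
        variance = h.pow(2).mean(-1, keepdim=True)
        h = h * torch.rsqrt(variance + eps)      # normalize in float32
        return weight * h.to(torch.bfloat16)     # scale after casting back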
+ hidden_states_276 = hidden_states_275 * rsqrt_50 + hidden_states_275 = rsqrt_50 = None + to_101 = hidden_states_276.to(torch.bfloat16) + hidden_states_276 = None + hidden_states_277 = ( + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ + * to_101 + ) + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = ( + to_101 + ) = None + qkv_25 = torch._C._nn.linear( + hidden_states_277, + l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_277 = l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_50 = qkv_25[(Ellipsis, slice(None, 3072, None))] + key_states_50 = qkv_25[(Ellipsis, slice(3072, 4096, None))] + value_states_50 = qkv_25[(Ellipsis, slice(4096, None, None))] + qkv_25 = None + view_75 = query_states_50.view((1, 2, -1, 128)) + query_states_50 = None + query_states_51 = view_75.transpose(1, 2) + view_75 = None + view_76 = key_states_50.view((1, 2, -1, 128)) + key_states_50 = None + key_states_51 = view_76.transpose(1, 2) + view_76 = None + view_77 = value_states_50.view((1, 2, -1, 128)) + value_states_50 = None + value_states_51 = view_77.transpose(1, 2) + view_77 = None + cos_25 = l_stack0_0_.unsqueeze(1) + sin_25 = l_stack0_1_.unsqueeze(1) + q_rot_25 = query_states_51[(Ellipsis, slice(None, 96, None))] + q_pass_25 = query_states_51[(Ellipsis, slice(96, None, None))] + query_states_51 = None + k_rot_25 = key_states_51[(Ellipsis, slice(None, 96, None))] + k_pass_25 = key_states_51[(Ellipsis, slice(96, None, None))] + key_states_51 = None + mul_227 = q_rot_25 * cos_25 + x1_50 = q_rot_25[(Ellipsis, slice(None, 48, None))] + x2_50 = q_rot_25[(Ellipsis, slice(48, None, None))] + q_rot_25 = None + neg_50 = -x2_50 + x2_50 = None + cat_100 = torch.cat((neg_50, x1_50), dim=-1) + neg_50 = x1_50 = None + mul_228 = cat_100 * sin_25 + cat_100 = None + add_151 = mul_227 + mul_228 + mul_227 = mul_228 = None + q_embed_25 = torch.cat([add_151, q_pass_25], dim=-1) + add_151 = q_pass_25 = None + mul_229 = k_rot_25 * cos_25 + cos_25 = None + x1_51 = k_rot_25[(Ellipsis, slice(None, 48, None))] + x2_51 = k_rot_25[(Ellipsis, slice(48, None, None))] + k_rot_25 = None + neg_51 = -x2_51 + x2_51 = None + cat_102 = torch.cat((neg_51, x1_51), dim=-1) + neg_51 = x1_51 = None + mul_230 = cat_102 * sin_25 + cat_102 = sin_25 = None + add_152 = mul_229 + mul_230 + mul_229 = mul_230 = None + k_embed_25 = torch.cat([add_152, k_pass_25], dim=-1) + add_152 = k_pass_25 = None + getitem_411 = k_embed_25[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_278 = getitem_411.expand(1, 8, 3, 2, 128) + getitem_411 = None + key_50 = hidden_states_278.reshape(1, 24, 2, 128) + hidden_states_278 = None + getitem_412 = value_states_51[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_279 = getitem_412.expand(1, 8, 3, 2, 128) + getitem_412 = None + value_50 = hidden_states_279.reshape(1, 24, 2, 128) + hidden_states_279 = None + attention_mask_25 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_25 = q_embed_25.contiguous() + q_embed_25 = None + key_51 = key_50.contiguous() + key_50 = None + value_51 = value_50.contiguous() + value_50 = None + attn_output_100 = 
torch._C._nn.scaled_dot_product_attention( + query_25, + key_51, + value_51, + attn_mask=attention_mask_25, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_25 = key_51 = value_51 = attention_mask_25 = None + transpose_103 = attn_output_100.transpose(1, 2) + attn_output_100 = None + attn_output_101 = transpose_103.contiguous() + transpose_103 = None + reshape_77 = attn_output_101.reshape(1, 2, -1) + attn_output_101 = None + attn_output_102 = reshape_77.contiguous() + reshape_77 = None + attn_output_103 = torch._C._nn.linear( + attn_output_102, + l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_102 = l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_50 = torch.nn.functional.dropout(attn_output_103, 0.0, False, False) + attn_output_103 = None + hidden_states_280 = hidden_states_274 + dropout_50 + hidden_states_274 = dropout_50 = None + hidden_states_281 = hidden_states_280.to(torch.float32) + pow_52 = hidden_states_281.pow(2) + variance_51 = pow_52.mean(-1, keepdim=True) + pow_52 = None + add_154 = variance_51 + 1e-05 + variance_51 = None + rsqrt_51 = torch.rsqrt(add_154) + add_154 = None + hidden_states_282 = hidden_states_281 * rsqrt_51 + hidden_states_281 = rsqrt_51 = None + to_103 = hidden_states_282.to(torch.bfloat16) + hidden_states_282 = None + hidden_states_283 = ( + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ + * to_103 + ) + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = ( + to_103 + ) = None + up_states_75 = torch._C._nn.linear( + hidden_states_283, + l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_283 = l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_25 = up_states_75.chunk(2, dim=-1) + up_states_75 = None + gate_25 = chunk_25[0] + up_states_76 = chunk_25[1] + chunk_25 = None + silu_25 = torch.nn.functional.silu(gate_25, inplace=False) + gate_25 = None + up_states_77 = up_states_76 * silu_25 + up_states_76 = silu_25 = None + hidden_states_284 = torch._C._nn.linear( + up_states_77, + l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_77 = l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_51 = torch.nn.functional.dropout(hidden_states_284, 0.0, False, False) + hidden_states_284 = None + hidden_states_285 = hidden_states_280 + dropout_51 + hidden_states_280 = dropout_51 = None + hidden_states_286 = hidden_states_285.to(torch.float32) + pow_53 = hidden_states_286.pow(2) + variance_52 = pow_53.mean(-1, keepdim=True) + pow_53 = None + add_156 = variance_52 + 1e-05 + variance_52 = None + rsqrt_52 = torch.rsqrt(add_156) + add_156 = None + hidden_states_287 = hidden_states_286 * rsqrt_52 + hidden_states_286 = rsqrt_52 = None + to_105 = hidden_states_287.to(torch.bfloat16) + hidden_states_287 = None + hidden_states_288 = ( + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ + * to_105 + ) + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = ( + to_105 + ) = None + qkv_26 = torch._C._nn.linear( + hidden_states_288, + l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_288 = 
l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_52 = qkv_26[(Ellipsis, slice(None, 3072, None))] + key_states_52 = qkv_26[(Ellipsis, slice(3072, 4096, None))] + value_states_52 = qkv_26[(Ellipsis, slice(4096, None, None))] + qkv_26 = None + view_78 = query_states_52.view((1, 2, -1, 128)) + query_states_52 = None + query_states_53 = view_78.transpose(1, 2) + view_78 = None + view_79 = key_states_52.view((1, 2, -1, 128)) + key_states_52 = None + key_states_53 = view_79.transpose(1, 2) + view_79 = None + view_80 = value_states_52.view((1, 2, -1, 128)) + value_states_52 = None + value_states_53 = view_80.transpose(1, 2) + view_80 = None + cos_26 = l_stack0_0_.unsqueeze(1) + sin_26 = l_stack0_1_.unsqueeze(1) + q_rot_26 = query_states_53[(Ellipsis, slice(None, 96, None))] + q_pass_26 = query_states_53[(Ellipsis, slice(96, None, None))] + query_states_53 = None + k_rot_26 = key_states_53[(Ellipsis, slice(None, 96, None))] + k_pass_26 = key_states_53[(Ellipsis, slice(96, None, None))] + key_states_53 = None + mul_236 = q_rot_26 * cos_26 + x1_52 = q_rot_26[(Ellipsis, slice(None, 48, None))] + x2_52 = q_rot_26[(Ellipsis, slice(48, None, None))] + q_rot_26 = None + neg_52 = -x2_52 + x2_52 = None + cat_104 = torch.cat((neg_52, x1_52), dim=-1) + neg_52 = x1_52 = None + mul_237 = cat_104 * sin_26 + cat_104 = None + add_157 = mul_236 + mul_237 + mul_236 = mul_237 = None + q_embed_26 = torch.cat([add_157, q_pass_26], dim=-1) + add_157 = q_pass_26 = None + mul_238 = k_rot_26 * cos_26 + cos_26 = None + x1_53 = k_rot_26[(Ellipsis, slice(None, 48, None))] + x2_53 = k_rot_26[(Ellipsis, slice(48, None, None))] + k_rot_26 = None + neg_53 = -x2_53 + x2_53 = None + cat_106 = torch.cat((neg_53, x1_53), dim=-1) + neg_53 = x1_53 = None + mul_239 = cat_106 * sin_26 + cat_106 = sin_26 = None + add_158 = mul_238 + mul_239 + mul_238 = mul_239 = None + k_embed_26 = torch.cat([add_158, k_pass_26], dim=-1) + add_158 = k_pass_26 = None + getitem_427 = k_embed_26[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_289 = getitem_427.expand(1, 8, 3, 2, 128) + getitem_427 = None + key_52 = hidden_states_289.reshape(1, 24, 2, 128) + hidden_states_289 = None + getitem_428 = value_states_53[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_290 = getitem_428.expand(1, 8, 3, 2, 128) + getitem_428 = None + value_52 = hidden_states_290.reshape(1, 24, 2, 128) + hidden_states_290 = None + attention_mask_26 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_26 = q_embed_26.contiguous() + q_embed_26 = None + key_53 = key_52.contiguous() + key_52 = None + value_53 = value_52.contiguous() + value_52 = None + attn_output_104 = torch._C._nn.scaled_dot_product_attention( + query_26, + key_53, + value_53, + attn_mask=attention_mask_26, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_26 = key_53 = value_53 = attention_mask_26 = None + transpose_107 = attn_output_104.transpose(1, 2) + attn_output_104 = None + attn_output_105 = transpose_107.contiguous() + transpose_107 = None + reshape_80 = attn_output_105.reshape(1, 2, -1) + attn_output_105 = None + attn_output_106 = reshape_80.contiguous() + reshape_80 = None + attn_output_107 = torch._C._nn.linear( + attn_output_106, + 
l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_106 = l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_52 = torch.nn.functional.dropout(attn_output_107, 0.0, False, False) + attn_output_107 = None + hidden_states_291 = hidden_states_285 + dropout_52 + hidden_states_285 = dropout_52 = None + hidden_states_292 = hidden_states_291.to(torch.float32) + pow_54 = hidden_states_292.pow(2) + variance_53 = pow_54.mean(-1, keepdim=True) + pow_54 = None + add_160 = variance_53 + 1e-05 + variance_53 = None + rsqrt_53 = torch.rsqrt(add_160) + add_160 = None + hidden_states_293 = hidden_states_292 * rsqrt_53 + hidden_states_292 = rsqrt_53 = None + to_107 = hidden_states_293.to(torch.bfloat16) + hidden_states_293 = None + hidden_states_294 = ( + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ + * to_107 + ) + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = ( + to_107 + ) = None + up_states_78 = torch._C._nn.linear( + hidden_states_294, + l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_294 = l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_26 = up_states_78.chunk(2, dim=-1) + up_states_78 = None + gate_26 = chunk_26[0] + up_states_79 = chunk_26[1] + chunk_26 = None + silu_26 = torch.nn.functional.silu(gate_26, inplace=False) + gate_26 = None + up_states_80 = up_states_79 * silu_26 + up_states_79 = silu_26 = None + hidden_states_295 = torch._C._nn.linear( + up_states_80, + l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_80 = l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_53 = torch.nn.functional.dropout(hidden_states_295, 0.0, False, False) + hidden_states_295 = None + hidden_states_296 = hidden_states_291 + dropout_53 + hidden_states_291 = dropout_53 = None + hidden_states_297 = hidden_states_296.to(torch.float32) + pow_55 = hidden_states_297.pow(2) + variance_54 = pow_55.mean(-1, keepdim=True) + pow_55 = None + add_162 = variance_54 + 1e-05 + variance_54 = None + rsqrt_54 = torch.rsqrt(add_162) + add_162 = None + hidden_states_298 = hidden_states_297 * rsqrt_54 + hidden_states_297 = rsqrt_54 = None + to_109 = hidden_states_298.to(torch.bfloat16) + hidden_states_298 = None + hidden_states_299 = ( + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ + * to_109 + ) + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = ( + to_109 + ) = None + qkv_27 = torch._C._nn.linear( + hidden_states_299, + l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_299 = l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_54 = qkv_27[(Ellipsis, slice(None, 3072, None))] + key_states_54 = qkv_27[(Ellipsis, slice(3072, 4096, None))] + value_states_54 = qkv_27[(Ellipsis, slice(4096, None, None))] + qkv_27 = None + view_81 = query_states_54.view((1, 2, -1, 128)) + query_states_54 = None + query_states_55 = view_81.transpose(1, 2) + view_81 = None + view_82 = key_states_54.view((1, 2, -1, 128)) + key_states_54 = None + key_states_55 = view_82.transpose(1, 2) + view_82 = None + view_83 = value_states_54.view((1, 2, -1, 128)) + value_states_54 = 
None + value_states_55 = view_83.transpose(1, 2) + view_83 = None + cos_27 = l_stack0_0_.unsqueeze(1) + sin_27 = l_stack0_1_.unsqueeze(1) + q_rot_27 = query_states_55[(Ellipsis, slice(None, 96, None))] + q_pass_27 = query_states_55[(Ellipsis, slice(96, None, None))] + query_states_55 = None + k_rot_27 = key_states_55[(Ellipsis, slice(None, 96, None))] + k_pass_27 = key_states_55[(Ellipsis, slice(96, None, None))] + key_states_55 = None + mul_245 = q_rot_27 * cos_27 + x1_54 = q_rot_27[(Ellipsis, slice(None, 48, None))] + x2_54 = q_rot_27[(Ellipsis, slice(48, None, None))] + q_rot_27 = None + neg_54 = -x2_54 + x2_54 = None + cat_108 = torch.cat((neg_54, x1_54), dim=-1) + neg_54 = x1_54 = None + mul_246 = cat_108 * sin_27 + cat_108 = None + add_163 = mul_245 + mul_246 + mul_245 = mul_246 = None + q_embed_27 = torch.cat([add_163, q_pass_27], dim=-1) + add_163 = q_pass_27 = None + mul_247 = k_rot_27 * cos_27 + cos_27 = None + x1_55 = k_rot_27[(Ellipsis, slice(None, 48, None))] + x2_55 = k_rot_27[(Ellipsis, slice(48, None, None))] + k_rot_27 = None + neg_55 = -x2_55 + x2_55 = None + cat_110 = torch.cat((neg_55, x1_55), dim=-1) + neg_55 = x1_55 = None + mul_248 = cat_110 * sin_27 + cat_110 = sin_27 = None + add_164 = mul_247 + mul_248 + mul_247 = mul_248 = None + k_embed_27 = torch.cat([add_164, k_pass_27], dim=-1) + add_164 = k_pass_27 = None + getitem_443 = k_embed_27[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_300 = getitem_443.expand(1, 8, 3, 2, 128) + getitem_443 = None + key_54 = hidden_states_300.reshape(1, 24, 2, 128) + hidden_states_300 = None + getitem_444 = value_states_55[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_301 = getitem_444.expand(1, 8, 3, 2, 128) + getitem_444 = None + value_54 = hidden_states_301.reshape(1, 24, 2, 128) + hidden_states_301 = None + attention_mask_27 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_27 = q_embed_27.contiguous() + q_embed_27 = None + key_55 = key_54.contiguous() + key_54 = None + value_55 = value_54.contiguous() + value_54 = None + attn_output_108 = torch._C._nn.scaled_dot_product_attention( + query_27, + key_55, + value_55, + attn_mask=attention_mask_27, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_27 = key_55 = value_55 = attention_mask_27 = None + transpose_111 = attn_output_108.transpose(1, 2) + attn_output_108 = None + attn_output_109 = transpose_111.contiguous() + transpose_111 = None + reshape_83 = attn_output_109.reshape(1, 2, -1) + attn_output_109 = None + attn_output_110 = reshape_83.contiguous() + reshape_83 = None + attn_output_111 = torch._C._nn.linear( + attn_output_110, + l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_110 = l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_54 = torch.nn.functional.dropout(attn_output_111, 0.0, False, False) + attn_output_111 = None + hidden_states_302 = hidden_states_296 + dropout_54 + hidden_states_296 = dropout_54 = None + hidden_states_303 = hidden_states_302.to(torch.float32) + pow_56 = hidden_states_303.pow(2) + variance_55 = pow_56.mean(-1, keepdim=True) + pow_56 = None + add_166 = variance_55 + 1e-05 + variance_55 = None + rsqrt_55 = 
torch.rsqrt(add_166) + add_166 = None + hidden_states_304 = hidden_states_303 * rsqrt_55 + hidden_states_303 = rsqrt_55 = None + to_111 = hidden_states_304.to(torch.bfloat16) + hidden_states_304 = None + hidden_states_305 = ( + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ + * to_111 + ) + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = ( + to_111 + ) = None + up_states_81 = torch._C._nn.linear( + hidden_states_305, + l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_305 = l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_27 = up_states_81.chunk(2, dim=-1) + up_states_81 = None + gate_27 = chunk_27[0] + up_states_82 = chunk_27[1] + chunk_27 = None + silu_27 = torch.nn.functional.silu(gate_27, inplace=False) + gate_27 = None + up_states_83 = up_states_82 * silu_27 + up_states_82 = silu_27 = None + hidden_states_306 = torch._C._nn.linear( + up_states_83, + l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_83 = l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_55 = torch.nn.functional.dropout(hidden_states_306, 0.0, False, False) + hidden_states_306 = None + hidden_states_307 = hidden_states_302 + dropout_55 + hidden_states_302 = dropout_55 = None + hidden_states_308 = hidden_states_307.to(torch.float32) + pow_57 = hidden_states_308.pow(2) + variance_56 = pow_57.mean(-1, keepdim=True) + pow_57 = None + add_168 = variance_56 + 1e-05 + variance_56 = None + rsqrt_56 = torch.rsqrt(add_168) + add_168 = None + hidden_states_309 = hidden_states_308 * rsqrt_56 + hidden_states_308 = rsqrt_56 = None + to_113 = hidden_states_309.to(torch.bfloat16) + hidden_states_309 = None + hidden_states_310 = ( + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ + * to_113 + ) + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = ( + to_113 + ) = None + qkv_28 = torch._C._nn.linear( + hidden_states_310, + l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_310 = l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_56 = qkv_28[(Ellipsis, slice(None, 3072, None))] + key_states_56 = qkv_28[(Ellipsis, slice(3072, 4096, None))] + value_states_56 = qkv_28[(Ellipsis, slice(4096, None, None))] + qkv_28 = None + view_84 = query_states_56.view((1, 2, -1, 128)) + query_states_56 = None + query_states_57 = view_84.transpose(1, 2) + view_84 = None + view_85 = key_states_56.view((1, 2, -1, 128)) + key_states_56 = None + key_states_57 = view_85.transpose(1, 2) + view_85 = None + view_86 = value_states_56.view((1, 2, -1, 128)) + value_states_56 = None + value_states_57 = view_86.transpose(1, 2) + view_86 = None + cos_28 = l_stack0_0_.unsqueeze(1) + sin_28 = l_stack0_1_.unsqueeze(1) + q_rot_28 = query_states_57[(Ellipsis, slice(None, 96, None))] + q_pass_28 = query_states_57[(Ellipsis, slice(96, None, None))] + query_states_57 = None + k_rot_28 = key_states_57[(Ellipsis, slice(None, 96, None))] + k_pass_28 = key_states_57[(Ellipsis, slice(96, None, None))] + key_states_57 = None + mul_254 = q_rot_28 * cos_28 + x1_56 = q_rot_28[(Ellipsis, slice(None, 48, None))] + x2_56 = q_rot_28[(Ellipsis, slice(48, None, None))] + q_rot_28 = None + neg_56 = -x2_56 + x2_56 = None + 
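Each MLP block has the same gated (GLU-style) shape: one fused gate_up_proj matmul (weight [16384, 3072]) is chunked in half, the first half is passed through SiLU and used to gate the second, and down_proj maps the 8192-wide result back to 3072. Sketched with public API calls (names illustrative):

    import torch

    def gated_mlp(hidden, gate_up_weight, down_weight):
        up_states = torch.nn.functional.linear(hidden, gate_up_weight)  # [..., 16384]
        gate, up = up_states.chunk(2, dim=-1)                           # 8192 each
        return torch.nn.functional.linear(
            up * torch.nn.functional.silu(gate), down_weight
        )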
cat_112 = torch.cat((neg_56, x1_56), dim=-1) + neg_56 = x1_56 = None + mul_255 = cat_112 * sin_28 + cat_112 = None + add_169 = mul_254 + mul_255 + mul_254 = mul_255 = None + q_embed_28 = torch.cat([add_169, q_pass_28], dim=-1) + add_169 = q_pass_28 = None + mul_256 = k_rot_28 * cos_28 + cos_28 = None + x1_57 = k_rot_28[(Ellipsis, slice(None, 48, None))] + x2_57 = k_rot_28[(Ellipsis, slice(48, None, None))] + k_rot_28 = None + neg_57 = -x2_57 + x2_57 = None + cat_114 = torch.cat((neg_57, x1_57), dim=-1) + neg_57 = x1_57 = None + mul_257 = cat_114 * sin_28 + cat_114 = sin_28 = None + add_170 = mul_256 + mul_257 + mul_256 = mul_257 = None + k_embed_28 = torch.cat([add_170, k_pass_28], dim=-1) + add_170 = k_pass_28 = None + getitem_459 = k_embed_28[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_311 = getitem_459.expand(1, 8, 3, 2, 128) + getitem_459 = None + key_56 = hidden_states_311.reshape(1, 24, 2, 128) + hidden_states_311 = None + getitem_460 = value_states_57[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_312 = getitem_460.expand(1, 8, 3, 2, 128) + getitem_460 = None + value_56 = hidden_states_312.reshape(1, 24, 2, 128) + hidden_states_312 = None + attention_mask_28 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_28 = q_embed_28.contiguous() + q_embed_28 = None + key_57 = key_56.contiguous() + key_56 = None + value_57 = value_56.contiguous() + value_56 = None + attn_output_112 = torch._C._nn.scaled_dot_product_attention( + query_28, + key_57, + value_57, + attn_mask=attention_mask_28, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_28 = key_57 = value_57 = attention_mask_28 = None + transpose_115 = attn_output_112.transpose(1, 2) + attn_output_112 = None + attn_output_113 = transpose_115.contiguous() + transpose_115 = None + reshape_86 = attn_output_113.reshape(1, 2, -1) + attn_output_113 = None + attn_output_114 = reshape_86.contiguous() + reshape_86 = None + attn_output_115 = torch._C._nn.linear( + attn_output_114, + l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_114 = l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_56 = torch.nn.functional.dropout(attn_output_115, 0.0, False, False) + attn_output_115 = None + hidden_states_313 = hidden_states_307 + dropout_56 + hidden_states_307 = dropout_56 = None + hidden_states_314 = hidden_states_313.to(torch.float32) + pow_58 = hidden_states_314.pow(2) + variance_57 = pow_58.mean(-1, keepdim=True) + pow_58 = None + add_172 = variance_57 + 1e-05 + variance_57 = None + rsqrt_57 = torch.rsqrt(add_172) + add_172 = None + hidden_states_315 = hidden_states_314 * rsqrt_57 + hidden_states_314 = rsqrt_57 = None + to_115 = hidden_states_315.to(torch.bfloat16) + hidden_states_315 = None + hidden_states_316 = ( + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ + * to_115 + ) + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = ( + to_115 + ) = None + up_states_84 = torch._C._nn.linear( + hidden_states_316, + l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_316 = 
l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_28 = up_states_84.chunk(2, dim=-1) + up_states_84 = None + gate_28 = chunk_28[0] + up_states_85 = chunk_28[1] + chunk_28 = None + silu_28 = torch.nn.functional.silu(gate_28, inplace=False) + gate_28 = None + up_states_86 = up_states_85 * silu_28 + up_states_85 = silu_28 = None + hidden_states_317 = torch._C._nn.linear( + up_states_86, + l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_86 = l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = (None) + dropout_57 = torch.nn.functional.dropout(hidden_states_317, 0.0, False, False) + hidden_states_317 = None + hidden_states_318 = hidden_states_313 + dropout_57 + hidden_states_313 = dropout_57 = None + hidden_states_319 = hidden_states_318.to(torch.float32) + pow_59 = hidden_states_319.pow(2) + variance_58 = pow_59.mean(-1, keepdim=True) + pow_59 = None + add_174 = variance_58 + 1e-05 + variance_58 = None + rsqrt_58 = torch.rsqrt(add_174) + add_174 = None + hidden_states_320 = hidden_states_319 * rsqrt_58 + hidden_states_319 = rsqrt_58 = None + to_117 = hidden_states_320.to(torch.bfloat16) + hidden_states_320 = None + hidden_states_321 = ( + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + * to_117 + ) + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = ( + to_117 + ) = None + qkv_29 = torch._C._nn.linear( + hidden_states_321, + l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_321 = l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_58 = qkv_29[(Ellipsis, slice(None, 3072, None))] + key_states_58 = qkv_29[(Ellipsis, slice(3072, 4096, None))] + value_states_58 = qkv_29[(Ellipsis, slice(4096, None, None))] + qkv_29 = None + view_87 = query_states_58.view((1, 2, -1, 128)) + query_states_58 = None + query_states_59 = view_87.transpose(1, 2) + view_87 = None + view_88 = key_states_58.view((1, 2, -1, 128)) + key_states_58 = None + key_states_59 = view_88.transpose(1, 2) + view_88 = None + view_89 = value_states_58.view((1, 2, -1, 128)) + value_states_58 = None + value_states_59 = view_89.transpose(1, 2) + view_89 = None + cos_29 = l_stack0_0_.unsqueeze(1) + sin_29 = l_stack0_1_.unsqueeze(1) + q_rot_29 = query_states_59[(Ellipsis, slice(None, 96, None))] + q_pass_29 = query_states_59[(Ellipsis, slice(96, None, None))] + query_states_59 = None + k_rot_29 = key_states_59[(Ellipsis, slice(None, 96, None))] + k_pass_29 = key_states_59[(Ellipsis, slice(96, None, None))] + key_states_59 = None + mul_263 = q_rot_29 * cos_29 + x1_58 = q_rot_29[(Ellipsis, slice(None, 48, None))] + x2_58 = q_rot_29[(Ellipsis, slice(48, None, None))] + q_rot_29 = None + neg_58 = -x2_58 + x2_58 = None + cat_116 = torch.cat((neg_58, x1_58), dim=-1) + neg_58 = x1_58 = None + mul_264 = cat_116 * sin_29 + cat_116 = None + add_175 = mul_263 + mul_264 + mul_263 = mul_264 = None + q_embed_29 = torch.cat([add_175, q_pass_29], dim=-1) + add_175 = q_pass_29 = None + mul_265 = k_rot_29 * cos_29 + cos_29 = None + x1_59 = k_rot_29[(Ellipsis, slice(None, 48, None))] + x2_59 = k_rot_29[(Ellipsis, slice(48, None, None))] + k_rot_29 = None + neg_59 = -x2_59 + x2_59 = None + cat_118 = torch.cat((neg_59, x1_59), dim=-1) + neg_59 = x1_59 = None + mul_266 = cat_118 * sin_29 + cat_118 = sin_29 = None + add_176 = mul_265 + mul_266 + 
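The indexing/expand/reshape triple applied to every k_embed_* and value_states_* tensor (insert a size-1 axis, expand(1, 8, 3, 2, 128), then reshape(1, 24, 2, 128)) is grouped-query attention: the 8 key/value heads are tiled 3x to line up with the 24 query heads. The generic idiom, under the same shape assumptions (names illustrative):

    import torch

    def repeat_kv(x, n_rep=3):
        # x: [batch, n_kv_heads, seq, head_dim] -> [batch, n_kv_heads * n_rep, seq, head_dim]
        batch, n_kv, seq, head_dim = x.shape
        if n_rep == 1:
            return x
        x = x[:, :, None, :, :].expand(batch, n_kv, n_rep, seq, head_dim)
        return x.reshape(batch, n_kv * n_rep, seq, head_dim)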
mul_265 = mul_266 = None + k_embed_29 = torch.cat([add_176, k_pass_29], dim=-1) + add_176 = k_pass_29 = None + getitem_475 = k_embed_29[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_322 = getitem_475.expand(1, 8, 3, 2, 128) + getitem_475 = None + key_58 = hidden_states_322.reshape(1, 24, 2, 128) + hidden_states_322 = None + getitem_476 = value_states_59[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_323 = getitem_476.expand(1, 8, 3, 2, 128) + getitem_476 = None + value_58 = hidden_states_323.reshape(1, 24, 2, 128) + hidden_states_323 = None + attention_mask_29 = l_causal_mask_[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_29 = q_embed_29.contiguous() + q_embed_29 = None + key_59 = key_58.contiguous() + key_58 = None + value_59 = value_58.contiguous() + value_58 = None + attn_output_116 = torch._C._nn.scaled_dot_product_attention( + query_29, + key_59, + value_59, + attn_mask=attention_mask_29, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_29 = key_59 = value_59 = attention_mask_29 = None + transpose_119 = attn_output_116.transpose(1, 2) + attn_output_116 = None + attn_output_117 = transpose_119.contiguous() + transpose_119 = None + reshape_89 = attn_output_117.reshape(1, 2, -1) + attn_output_117 = None + attn_output_118 = reshape_89.contiguous() + reshape_89 = None + attn_output_119 = torch._C._nn.linear( + attn_output_118, + l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_118 = l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + dropout_58 = torch.nn.functional.dropout(attn_output_119, 0.0, False, False) + attn_output_119 = None + hidden_states_324 = hidden_states_318 + dropout_58 + hidden_states_318 = dropout_58 = None + hidden_states_325 = hidden_states_324.to(torch.float32) + pow_60 = hidden_states_325.pow(2) + variance_59 = pow_60.mean(-1, keepdim=True) + pow_60 = None + add_178 = variance_59 + 1e-05 + variance_59 = None + rsqrt_59 = torch.rsqrt(add_178) + add_178 = None + hidden_states_326 = hidden_states_325 * rsqrt_59 + hidden_states_325 = rsqrt_59 = None + to_119 = hidden_states_326.to(torch.bfloat16) + hidden_states_326 = None + hidden_states_327 = ( + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ + * to_119 + ) + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = ( + to_119 + ) = None + up_states_87 = torch._C._nn.linear( + hidden_states_327, + l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_, + None, + ) + hidden_states_327 = l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None) + chunk_29 = up_states_87.chunk(2, dim=-1) + up_states_87 = None + gate_29 = chunk_29[0] + up_states_88 = chunk_29[1] + chunk_29 = None + silu_29 = torch.nn.functional.silu(gate_29, inplace=False) + gate_29 = None + up_states_89 = up_states_88 * silu_29 + up_states_88 = silu_29 = None + hidden_states_328 = torch._C._nn.linear( + up_states_89, + l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + up_states_89 = l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = (None) + 
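Every attention call in the trace invokes scaled_dot_product_attention with an explicit scale of 0.08838834764831845, which is 1/sqrt(128) for the 128-wide heads, and with is_causal=False because the boolean causal mask is supplied explicitly, pre-sliced to the current key length (l_causal_mask_[..., :2] for this 2-token capture). The same call through the public API (PyTorch >= 2.1 for the scale keyword; the tensor contents here are placeholders):

    import math
    import torch

    q = torch.randn(1, 24, 2, 128)
    k = torch.randn(1, 24, 2, 128)
    v = torch.randn(1, 24, 2, 128)
    # True = attend; matches the [[True, False], [True, True]] mask in weight_meta.py
    mask = torch.tril(torch.ones(2, 2, dtype=torch.bool)).view(1, 1, 2, 2)
    out = torch.nn.functional.scaled_dot_product_attention(
        q, k, v,
        attn_mask=mask,
        dropout_p=0.0,
        scale=1.0 / math.sqrt(128),   # == 0.08838834764831845
        is_causal=False,
    )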
dropout_59 = torch.nn.functional.dropout(hidden_states_328, 0.0, False, False) + hidden_states_328 = None + hidden_states_329 = hidden_states_324 + dropout_59 + hidden_states_324 = dropout_59 = None + hidden_states_330 = hidden_states_329.to(torch.float32) + pow_61 = hidden_states_330.pow(2) + variance_60 = pow_61.mean(-1, keepdim=True) + pow_61 = None + add_180 = variance_60 + 1e-05 + variance_60 = None + rsqrt_60 = torch.rsqrt(add_180) + add_180 = None + hidden_states_331 = hidden_states_330 * rsqrt_60 + hidden_states_330 = rsqrt_60 = None + to_121 = hidden_states_331.to(torch.bfloat16) + hidden_states_331 = None + hidden_states_332 = ( + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + * to_121 + ) + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = ( + to_121 + ) = None + qkv_30 = torch._C._nn.linear( + hidden_states_332, + l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_, + None, + ) + hidden_states_332 = l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None) + query_states_60 = qkv_30[(Ellipsis, slice(None, 3072, None))] + key_states_60 = qkv_30[(Ellipsis, slice(3072, 4096, None))] + value_states_60 = qkv_30[(Ellipsis, slice(4096, None, None))] + qkv_30 = None + view_90 = query_states_60.view((1, 2, -1, 128)) + query_states_60 = None + query_states_61 = view_90.transpose(1, 2) + view_90 = None + view_91 = key_states_60.view((1, 2, -1, 128)) + key_states_60 = None + key_states_61 = view_91.transpose(1, 2) + view_91 = None + view_92 = value_states_60.view((1, 2, -1, 128)) + value_states_60 = None + value_states_61 = view_92.transpose(1, 2) + view_92 = None + cos_30 = l_stack0_0_.unsqueeze(1) + sin_30 = l_stack0_1_.unsqueeze(1) + q_rot_30 = query_states_61[(Ellipsis, slice(None, 96, None))] + q_pass_30 = query_states_61[(Ellipsis, slice(96, None, None))] + query_states_61 = None + k_rot_30 = key_states_61[(Ellipsis, slice(None, 96, None))] + k_pass_30 = key_states_61[(Ellipsis, slice(96, None, None))] + key_states_61 = None + mul_272 = q_rot_30 * cos_30 + x1_60 = q_rot_30[(Ellipsis, slice(None, 48, None))] + x2_60 = q_rot_30[(Ellipsis, slice(48, None, None))] + q_rot_30 = None + neg_60 = -x2_60 + x2_60 = None + cat_120 = torch.cat((neg_60, x1_60), dim=-1) + neg_60 = x1_60 = None + mul_273 = cat_120 * sin_30 + cat_120 = None + add_181 = mul_272 + mul_273 + mul_272 = mul_273 = None + q_embed_30 = torch.cat([add_181, q_pass_30], dim=-1) + add_181 = q_pass_30 = None + mul_274 = k_rot_30 * cos_30 + cos_30 = None + x1_61 = k_rot_30[(Ellipsis, slice(None, 48, None))] + x2_61 = k_rot_30[(Ellipsis, slice(48, None, None))] + k_rot_30 = None + neg_61 = -x2_61 + x2_61 = None + cat_122 = torch.cat((neg_61, x1_61), dim=-1) + neg_61 = x1_61 = None + mul_275 = cat_122 * sin_30 + cat_122 = sin_30 = None + add_182 = mul_274 + mul_275 + mul_274 = mul_275 = None + k_embed_30 = torch.cat([add_182, k_pass_30], dim=-1) + add_182 = k_pass_30 = None + getitem_491 = k_embed_30[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_333 = getitem_491.expand(1, 8, 3, 2, 128) + getitem_491 = None + key_60 = hidden_states_333.reshape(1, 24, 2, 128) + hidden_states_333 = None + getitem_492 = value_states_61[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + hidden_states_334 = getitem_492.expand(1, 8, 3, 2, 128) + 
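The dropout_* nodes threaded through both residual paths of every layer call torch.nn.functional.dropout(x, 0.0, False, False), i.e. p=0.0 with training=False, so each one is an identity that Dynamo preserved from the eager module; the recurring hidden_states_{n+1} = hidden_states_n + dropout(...) is therefore a plain pre-norm residual add. A quick check:

    import torch

    x = torch.randn(1, 2, 3072)
    y = torch.nn.functional.dropout(x, 0.0, False, False)  # (input, p, training, inplace)
    assert torch.equal(x, y)  # identity at p = 0.0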
+    getitem_492 = None
+    value_60 = hidden_states_334.reshape(1, 24, 2, 128)
+    hidden_states_334 = None
+    attention_mask_30 = l_causal_mask_[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_30 = q_embed_30.contiguous()
+    q_embed_30 = None
+    key_61 = key_60.contiguous()
+    key_60 = None
+    value_61 = value_60.contiguous()
+    value_60 = None
+    attn_output_120 = torch._C._nn.scaled_dot_product_attention(
+        query_30,
+        key_61,
+        value_61,
+        attn_mask=attention_mask_30,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_30 = key_61 = value_61 = attention_mask_30 = None
+    transpose_123 = attn_output_120.transpose(1, 2)
+    attn_output_120 = None
+    attn_output_121 = transpose_123.contiguous()
+    transpose_123 = None
+    reshape_92 = attn_output_121.reshape(1, 2, -1)
+    attn_output_121 = None
+    attn_output_122 = reshape_92.contiguous()
+    reshape_92 = None
+    attn_output_123 = torch._C._nn.linear(
+        attn_output_122,
+        l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_122 = l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    dropout_60 = torch.nn.functional.dropout(attn_output_123, 0.0, False, False)
+    attn_output_123 = None
+    hidden_states_335 = hidden_states_329 + dropout_60
+    hidden_states_329 = dropout_60 = None
+    hidden_states_336 = hidden_states_335.to(torch.float32)
+    pow_62 = hidden_states_336.pow(2)
+    variance_61 = pow_62.mean(-1, keepdim=True)
+    pow_62 = None
+    add_184 = variance_61 + 1e-05
+    variance_61 = None
+    rsqrt_61 = torch.rsqrt(add_184)
+    add_184 = None
+    hidden_states_337 = hidden_states_336 * rsqrt_61
+    hidden_states_336 = rsqrt_61 = None
+    to_123 = hidden_states_337.to(torch.bfloat16)
+    hidden_states_337 = None
+    hidden_states_338 = (
+        l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_
+        * to_123
+    )
+    l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = (
+        to_123
+    ) = None
+    up_states_90 = torch._C._nn.linear(
+        hidden_states_338,
+        l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_338 = l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None)
+    chunk_30 = up_states_90.chunk(2, dim=-1)
+    up_states_90 = None
+    gate_30 = chunk_30[0]
+    up_states_91 = chunk_30[1]
+    chunk_30 = None
+    silu_30 = torch.nn.functional.silu(gate_30, inplace=False)
+    gate_30 = None
+    up_states_92 = up_states_91 * silu_30
+    up_states_91 = silu_30 = None
+    hidden_states_339 = torch._C._nn.linear(
+        up_states_92,
+        l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    up_states_92 = l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    dropout_61 = torch.nn.functional.dropout(hidden_states_339, 0.0, False, False)
+    hidden_states_339 = None
+    hidden_states_340 = hidden_states_335 + dropout_61
+    hidden_states_335 = dropout_61 = None
+    hidden_states_341 = hidden_states_340.to(torch.float32)
+    pow_63 = hidden_states_341.pow(2)
+    variance_62 = pow_63.mean(-1, keepdim=True)
+    pow_63 = None
+    add_186 = variance_62 + 1e-05
+    variance_62 = None
+    rsqrt_62 = torch.rsqrt(add_186)
+    add_186 = None
+    hidden_states_342 = hidden_states_341 * rsqrt_62
+    hidden_states_341 = rsqrt_62 = None
+    to_125 = hidden_states_342.to(torch.bfloat16)
+    hidden_states_342 = None
+    hidden_states_343 = (
+        l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_
+        * to_125
+    )
+    l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = (
+        to_125
+    ) = None
+    qkv_31 = torch._C._nn.linear(
+        hidden_states_343,
+        l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_343 = l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = (None)
+    query_states_62 = qkv_31[(Ellipsis, slice(None, 3072, None))]
+    key_states_62 = qkv_31[(Ellipsis, slice(3072, 4096, None))]
+    value_states_62 = qkv_31[(Ellipsis, slice(4096, None, None))]
+    qkv_31 = None
+    view_93 = query_states_62.view((1, 2, -1, 128))
+    query_states_62 = None
+    query_states_63 = view_93.transpose(1, 2)
+    view_93 = None
+    view_94 = key_states_62.view((1, 2, -1, 128))
+    key_states_62 = None
+    key_states_63 = view_94.transpose(1, 2)
+    view_94 = None
+    view_95 = value_states_62.view((1, 2, -1, 128))
+    value_states_62 = None
+    value_states_63 = view_95.transpose(1, 2)
+    view_95 = None
+    cos_31 = l_stack0_0_.unsqueeze(1)
+    l_stack0_0_ = None
+    sin_31 = l_stack0_1_.unsqueeze(1)
+    l_stack0_1_ = None
+    q_rot_31 = query_states_63[(Ellipsis, slice(None, 96, None))]
+    q_pass_31 = query_states_63[(Ellipsis, slice(96, None, None))]
+    query_states_63 = None
+    k_rot_31 = key_states_63[(Ellipsis, slice(None, 96, None))]
+    k_pass_31 = key_states_63[(Ellipsis, slice(96, None, None))]
+    key_states_63 = None
+    mul_281 = q_rot_31 * cos_31
+    x1_62 = q_rot_31[(Ellipsis, slice(None, 48, None))]
+    x2_62 = q_rot_31[(Ellipsis, slice(48, None, None))]
+    q_rot_31 = None
+    neg_62 = -x2_62
+    x2_62 = None
+    cat_124 = torch.cat((neg_62, x1_62), dim=-1)
+    neg_62 = x1_62 = None
+    mul_282 = cat_124 * sin_31
+    cat_124 = None
+    add_187 = mul_281 + mul_282
+    mul_281 = mul_282 = None
+    q_embed_31 = torch.cat([add_187, q_pass_31], dim=-1)
+    add_187 = q_pass_31 = None
+    mul_283 = k_rot_31 * cos_31
+    cos_31 = None
+    x1_63 = k_rot_31[(Ellipsis, slice(None, 48, None))]
+    x2_63 = k_rot_31[(Ellipsis, slice(48, None, None))]
+    k_rot_31 = None
+    neg_63 = -x2_63
+    x2_63 = None
+    cat_126 = torch.cat((neg_63, x1_63), dim=-1)
+    neg_63 = x1_63 = None
+    mul_284 = cat_126 * sin_31
+    cat_126 = sin_31 = None
+    add_188 = mul_283 + mul_284
+    mul_283 = mul_284 = None
+    k_embed_31 = torch.cat([add_188, k_pass_31], dim=-1)
+    add_188 = k_pass_31 = None
+    getitem_507 = k_embed_31[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            None,
+            slice(None, None, None),
+            slice(None, None, None),
+        )
+    ]
+    hidden_states_344 = getitem_507.expand(1, 8, 3, 2, 128)
+    getitem_507 = None
+    key_62 = hidden_states_344.reshape(1, 24, 2, 128)
+    hidden_states_344 = None
+    getitem_508 = value_states_63[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            None,
+            slice(None, None, None),
+            slice(None, None, None),
+        )
+    ]
+    hidden_states_345 = getitem_508.expand(1, 8, 3, 2, 128)
+    getitem_508 = None
+    value_62 = hidden_states_345.reshape(1, 24, 2, 128)
+    hidden_states_345 = None
+    attention_mask_31 = l_causal_mask_[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    l_causal_mask_ = None
+    query_31 = q_embed_31.contiguous()
+    q_embed_31 = None
+    key_63 = key_62.contiguous()
+    key_62 = None
+    value_63 = value_62.contiguous()
+    value_62 = None
+    attn_output_124 = torch._C._nn.scaled_dot_product_attention(
+        query_31,
+        key_63,
+        value_63,
+        attn_mask=attention_mask_31,
+        dropout_p=0.0,
+        scale=0.08838834764831845,
+        is_causal=False,
+    )
+    query_31 = key_63 = value_63 = attention_mask_31 = None
+    transpose_127 = attn_output_124.transpose(1, 2)
+    attn_output_124 = None
+    attn_output_125 = transpose_127.contiguous()
+    transpose_127 = None
+    reshape_95 = attn_output_125.reshape(1, 2, -1)
+    attn_output_125 = None
+    attn_output_126 = reshape_95.contiguous()
+    reshape_95 = None
+    attn_output_127 = torch._C._nn.linear(
+        attn_output_126,
+        l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_,
+        None,
+    )
+    attn_output_126 = l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+    dropout_62 = torch.nn.functional.dropout(attn_output_127, 0.0, False, False)
+    attn_output_127 = None
+    hidden_states_346 = hidden_states_340 + dropout_62
+    hidden_states_340 = dropout_62 = None
+    hidden_states_347 = hidden_states_346.to(torch.float32)
+    pow_64 = hidden_states_347.pow(2)
+    variance_63 = pow_64.mean(-1, keepdim=True)
+    pow_64 = None
+    add_190 = variance_63 + 1e-05
+    variance_63 = None
+    rsqrt_63 = torch.rsqrt(add_190)
+    add_190 = None
+    hidden_states_348 = hidden_states_347 * rsqrt_63
+    hidden_states_347 = rsqrt_63 = None
+    to_127 = hidden_states_348.to(torch.bfloat16)
+    hidden_states_348 = None
+    hidden_states_349 = (
+        l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_
+        * to_127
+    )
+    l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = (
+        to_127
+    ) = None
+    up_states_93 = torch._C._nn.linear(
+        hidden_states_349,
+        l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_,
+        None,
+    )
+    hidden_states_349 = l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = (None)
+    chunk_31 = up_states_93.chunk(2, dim=-1)
+    up_states_93 = None
+    gate_31 = chunk_31[0]
+    up_states_94 = chunk_31[1]
+    chunk_31 = None
+    silu_31 = torch.nn.functional.silu(gate_31, inplace=False)
+    gate_31 = None
+    up_states_95 = up_states_94 * silu_31
+    up_states_94 = silu_31 = None
+    hidden_states_350 = torch._C._nn.linear(
+        up_states_95,
+        l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_,
+        None,
+    )
+    up_states_95 = l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+    dropout_63 = torch.nn.functional.dropout(hidden_states_350, 0.0, False, False)
+    hidden_states_350 = None
+    hidden_states_351 = hidden_states_346 + dropout_63
+    hidden_states_346 = dropout_63 = None
+    hidden_states_352 = hidden_states_351.to(torch.float32)
+    hidden_states_351 = None
+    pow_65 = hidden_states_352.pow(2)
+    variance_64 = pow_65.mean(-1, keepdim=True)
+    pow_65 = None
+    add_192 = variance_64 + 1e-05
+    variance_64 = None
+    rsqrt_64 = torch.rsqrt(add_192)
+    add_192 = None
+    hidden_states_353 = hidden_states_352 * rsqrt_64
+    hidden_states_352 = rsqrt_64 = None
+    to_129 = hidden_states_353.to(torch.bfloat16)
+    hidden_states_353 = None
+    hidden_states_354 = l_self_modules_norm_parameters_weight_ * to_129
+    l_self_modules_norm_parameters_weight_ = to_129 = None
+    return (
+        value_states_1,
+        k_embed,
+        value_states_3,
+        k_embed_1,
+        value_states_5,
+        k_embed_2,
+        value_states_7,
+        k_embed_3,
+        value_states_9,
+        k_embed_4,
+        value_states_11,
+        k_embed_5,
+        value_states_13,
+        k_embed_6,
+        value_states_15,
+        k_embed_7,
+        value_states_17,
+        k_embed_8,
+        value_states_19,
+        k_embed_9,
+        value_states_21,
+        k_embed_10,
+        value_states_23,
+        k_embed_11,
+        value_states_25,
+        k_embed_12,
+        value_states_27,
+        k_embed_13,
+        value_states_29,
+        k_embed_14,
+        value_states_31,
+        k_embed_15,
+        value_states_33,
+        k_embed_16,
+        value_states_35,
+        k_embed_17,
+        value_states_37,
+        k_embed_18,
+        value_states_39,
+        k_embed_19,
+        value_states_41,
+        k_embed_20,
+        value_states_43,
+        k_embed_21,
+        value_states_45,
+        k_embed_22,
+        value_states_47,
+        k_embed_23,
+        value_states_49,
+        k_embed_24,
+        value_states_51,
+        k_embed_25,
+        value_states_53,
+        k_embed_26,
+        value_states_55,
+        k_embed_27,
+        value_states_57,
+        k_embed_28,
+        value_states_59,
+        k_embed_29,
+        value_states_61,
+        k_embed_30,
+        value_states_63,
+        k_embed_31,
+        hidden_states_354,
+    )
diff --git a/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/weight_meta.py b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/weight_meta.py
new file mode 100644
index 000000000..99a0407e0
--- /dev/null
+++ b/samples/transformers-auto-model/microsoft/Phi-4-mini-instruct/weight_meta.py
@@ -0,0 +1,1968 @@
+class Program_weight_tensor_meta_L_hidden_states_:
+    name = "L_hidden_states_"
+    shape = [1, 2, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_"
+    shape = [3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_"
+    shape = [5120, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_stack0_0_:
+    name = "L_stack0_0_"
+    shape = [1, 2, 96]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.172
+    std = 0.075
+    data = None
+
+
+class Program_weight_tensor_meta_L_stack0_1_:
+    name = "L_stack0_1_"
+    shape = [1, 2, 96]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.066
+    std = 0.184
+    data = None
+
+
+class Program_weight_tensor_meta_L_causal_mask_:
+    name = "L_causal_mask_"
+    shape = [1, 1, 2, 2]
+    dtype = "torch.bool"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [True, False, True, True]
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [3072, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_"
+    shape = [3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_"
+    shape = [16384, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_:
+    name =
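The pow(2).mean(-1, keepdim=True) / torch.rsqrt sequence that brackets every attention and MLP block in the traced forward above is an inlined RMSNorm with eps = 1e-05: the variance is computed in float32 and the result is cast back to bfloat16 before the learned elementwise scale is applied. A minimal sketch of the same computation as a standalone module (the class name and signature are illustrative, not part of the patch):

import torch

class RMSNorm(torch.nn.Module):
    # Mirrors the traced pattern: fp32 variance, rsqrt, scale in the input dtype.
    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = x.to(torch.float32)
        variance = h.pow(2).mean(-1, keepdim=True)
        h = h * torch.rsqrt(variance + self.eps)
        return self.weight * h.to(x.dtype)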
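Each self_attn block applies a partial rotary embedding: only the first 96 of the 128 channels per head (the q_rot_*/k_rot_* slices) are rotated, the remaining 32 (q_pass_*/k_pass_*) pass through unchanged, and the cos/sin tables (L_stack0_0_ and L_stack0_1_, shape [1, 2, 96]) are unsqueezed at dim 1 to broadcast over heads. A sketch of the same transform, with helper names chosen here for illustration:

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # The traced cat((-x2, x1)) over the 48/48 halves of the rotary channels.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_partial_rope(q, k, cos, sin, rotary_dim: int = 96):
    # Rotate the first rotary_dim channels of each head; concatenate the
    # untouched tail (128 - 96 = 32 channels here) back on afterwards.
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
    q_out = torch.cat([q_rot * cos + rotate_half(q_rot) * sin, q_pass], dim=-1)
    k_out = torch.cat([k_rot * cos + rotate_half(k_rot) * sin, k_pass], dim=-1)
    return q_out, k_out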
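The qkv_proj output (5120 channels) is sliced into 3072 query, 1024 key, and 1024 value channels, i.e. 24 query heads sharing 8 key/value heads of dimension 128. The expand(1, 8, 3, 2, 128).reshape(1, 24, 2, 128) pairs before each scaled_dot_product_attention call replicate every KV head three times (grouped-query attention), the explicit scale 0.08838834764831845 is 1/sqrt(128), and the per-layer k_embed_*/value_states_* tensors are returned at the end of the graph as the updated KV cache. A sketch of the expansion (function name illustrative):

import math

import torch

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    # [batch, kv_heads, seq, head_dim] -> [batch, kv_heads * n_rep, seq, head_dim],
    # mirroring the traced unsqueeze/expand/reshape sequence.
    b, kv_heads, seq, head_dim = x.shape
    x = x[:, :, None, :, :].expand(b, kv_heads, n_rep, seq, head_dim)
    return x.reshape(b, kv_heads * n_rep, seq, head_dim)

# 24 query heads / 8 KV heads gives n_rep = 3; the SDPA scale is 1 / sqrt(head_dim).
assert math.isclose(1.0 / math.sqrt(128), 0.08838834764831845)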
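Each mlp block uses a fused gate/up projection: the [16384, 3072] gate_up_proj output is chunked into two 8192-channel halves, the product up * silu(gate) is formed, and the [3072, 8192] down_proj maps back to the hidden size. The same computation as a free function (names illustrative):

import torch
import torch.nn.functional as F

def gated_mlp(x, gate_up_weight, down_weight):
    # gate_up_weight: [2 * intermediate, hidden]; down_weight: [hidden, intermediate].
    gate, up = F.linear(x, gate_up_weight).chunk(2, dim=-1)
    return F.linear(up * F.silu(gate), down_weight)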
"L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std 
= 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = 
"cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = 
"torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = 
"cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = 
[5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: + name = 
"L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + 
mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 
3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" 
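Each Program_weight_tensor_meta_* record stores only shape, dtype, device, and summary statistics; literal values (the data field) are kept just for tiny tensors such as the boolean causal mask, so the large weights can be re-synthesized rather than shipped. A hypothetical helper showing one way a consumer might materialize a tensor from such a record (the function is not part of the patch):

import torch

def materialize(meta) -> torch.Tensor:
    # Rebuild literal data when present; otherwise sample N(mean, std)
    # in the recorded shape, then move to the recorded dtype and device.
    dtype = getattr(torch, meta.dtype.split(".")[-1])
    if meta.data is not None:
        return torch.tensor(meta.data, dtype=dtype, device=meta.device).reshape(meta.shape)
    t = torch.randn(meta.shape) * meta.std + meta.mean
    return t.to(device=meta.device, dtype=dtype)

For example, materialize(Program_weight_tensor_meta_L_causal_mask_) reproduces the stored [1, 1, 2, 2] boolean mask, while the layer weights come back as N(0, 0.02) samples of the recorded shape.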
+ mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_" + shape = 
[3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_" + shape = [5120, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: + name = 
"L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [3072, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_" + shape = [16384, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_" + shape = [3072, 8192] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_norm_parameters_weight_: + name = "L_self_modules_norm_parameters_weight_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/microsoft/phi-1/graph_hash.txt b/samples/transformers-auto-model/microsoft/phi-1/graph_hash.txt new file mode 100644 index 000000000..a93a3d88f --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1/graph_hash.txt @@ -0,0 +1 @@ +18fdff0b8ef80edb252dce2c27c15258c0e24b18c8e21f7cd5242df4ae4d2479 \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/phi-1/graph_net.json b/samples/transformers-auto-model/microsoft/phi-1/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/phi-1/input_meta.py b/samples/transformers-auto-model/microsoft/phi-1/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/phi-1/input_tensor_constraints.py b/samples/transformers-auto-model/microsoft/phi-1/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/phi-1/model.py b/samples/transformers-auto-model/microsoft/phi-1/model.py new file mode 100644 index 000000000..d4c6319b7 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1/model.py @@ -0,0 +1,4894 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_inputs_embeds_: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_self_modules_rotary_emb_buffers_inv_freq_: torch.Tensor, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_: 
diff --git a/samples/transformers-auto-model/microsoft/phi-1/graph_hash.txt b/samples/transformers-auto-model/microsoft/phi-1/graph_hash.txt new file mode 100644 index 000000000..a93a3d88f --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1/graph_hash.txt @@ -0,0 +1 @@ +18fdff0b8ef80edb252dce2c27c15258c0e24b18c8e21f7cd5242df4ae4d2479 \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/phi-1/graph_net.json b/samples/transformers-auto-model/microsoft/phi-1/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/phi-1/input_meta.py b/samples/transformers-auto-model/microsoft/phi-1/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/phi-1/input_tensor_constraints.py b/samples/transformers-auto-model/microsoft/phi-1/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/phi-1/model.py b/samples/transformers-auto-model/microsoft/phi-1/model.py new file mode 100644 index 000000000..d4c6319b7 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1/model.py @@ -0,0 +1,4894 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_inputs_embeds_: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_self_modules_rotary_emb_buffers_inv_freq_: torch.Tensor, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_:
torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, +
L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_: 
torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_final_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_final_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_inputs_embeds_ = L_inputs_embeds_ + l_attention_mask_ = L_attention_mask_ + l_self_modules_rotary_emb_buffers_inv_freq_ = ( + L_self_modules_rotary_emb_buffers_inv_freq_ + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ + 
l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_ 
= ( + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ = 
L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_ + 
) + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ + 
l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_ = ( + 
L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = 
L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_ = ( + 
L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = 
L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_ = ( + 
L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = 
L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_ = ( + 
L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = 
L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_ = ( + 
L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = 
L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_ = ( + 
L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_final_layernorm_parameters_weight_ = ( + L_self_modules_final_layernorm_parameters_weight_ + ) + l_self_modules_final_layernorm_parameters_bias_ = ( + L_self_modules_final_layernorm_parameters_bias_ + ) + cache_position = torch.arange(0, 2, device=device(type="cuda", index=0)) + position_ids = cache_position.unsqueeze(0) + attention_mask = l_attention_mask_.to( + device=device(type="cuda", index=0), dtype=torch.bool + ) + l_attention_mask_ = None + mask_indices = torch.arange(2, device=device(type="cuda", index=0)) + mask_indices += 0 + mask_indices_1 = mask_indices + mask_indices = None + local_padding_mask = attention_mask[(slice(None, None, None), mask_indices_1)] + attention_mask = mask_indices_1 = None + kv_arange = torch.arange(2, device=device(type="cuda", index=0)) + kv_arange += 0 + kv_arange_1 = kv_arange + kv_arange = None + reshaped_cache_position = cache_position.view(-1, 1) + cache_position = None + causal_mask = kv_arange_1 <= reshaped_cache_position + kv_arange_1 = reshaped_cache_position = None + getitem_1 = causal_mask[ + (None, None, slice(None, None, None), slice(None, None, 
None)) + ] + causal_mask = None + causal_mask_1 = getitem_1.expand(1, -1, -1, -1) + getitem_1 = None + getitem_2 = local_padding_mask[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + local_padding_mask = None + causal_mask_2 = causal_mask_1 * getitem_2 + causal_mask_1 = getitem_2 = None + inputs_embeds = torch.nn.functional.dropout(l_inputs_embeds_, 0.0, False, False) + l_inputs_embeds_ = None + _set_grad_enabled = torch._C._set_grad_enabled(False) + _set_grad_enabled = None + getitem_3 = l_self_modules_rotary_emb_buffers_inv_freq_[ + (None, slice(None, None, None), None) + ] + l_self_modules_rotary_emb_buffers_inv_freq_ = None + float_1 = getitem_3.float() + getitem_3 = None + expand_1 = float_1.expand(1, -1, 1) + float_1 = None + inv_freq_expanded = expand_1.to(device(type="cuda", index=0)) + expand_1 = None + getitem_4 = position_ids[ + (slice(None, None, None), None, slice(None, None, None)) + ] + position_ids = None + position_ids_expanded = getitem_4.float() + getitem_4 = None + float_3 = inv_freq_expanded.float() + inv_freq_expanded = None + float_4 = position_ids_expanded.float() + position_ids_expanded = None + matmul = float_3 @ float_4 + float_3 = float_4 = None + freqs = matmul.transpose(1, 2) + matmul = None + emb = torch.cat((freqs, freqs), dim=-1) + freqs = None + cos = emb.cos() + cos_1 = cos * 1.0 + cos = None + sin = emb.sin() + emb = None + sin_1 = sin * 1.0 + sin = None + cos_2 = cos_1.to(dtype=torch.float32) + cos_1 = None + sin_2 = sin_1.to(dtype=torch.float32) + sin_1 = None + _set_grad_enabled_1 = torch._C._set_grad_enabled(True) + _set_grad_enabled_1 = None + _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module") + _log_api_usage_once = None + hidden_states = torch.nn.functional.layer_norm( + inputs_embeds, + (2048,), + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ + ) = None + linear = torch._C._nn.linear( + hidden_states, + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_1 = linear.view((1, 2, -1, 64)) + linear = None + query_states = view_1.transpose(1, 2) + view_1 = None + linear_1 = torch._C._nn.linear( + hidden_states, + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_2 = linear_1.view((1, 2, -1, 64)) + linear_1 = None + key_states = view_2.transpose(1, 2) + view_2 = None + linear_2 = torch._C._nn.linear( + hidden_states, + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = 
l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_3 = linear_2.view((1, 2, -1, 64)) + linear_2 = None + value_states = view_3.transpose(1, 2) + view_3 = None + query_rot = query_states[(Ellipsis, slice(None, 32, None))] + query_pass = query_states[(Ellipsis, slice(32, None, None))] + query_states = None + key_rot = key_states[(Ellipsis, slice(None, 32, None))] + key_pass = key_states[(Ellipsis, slice(32, None, None))] + key_states = None + cos_3 = cos_2.unsqueeze(1) + sin_3 = sin_2.unsqueeze(1) + mul_3 = query_rot * cos_3 + x1 = query_rot[(Ellipsis, slice(None, 16, None))] + x2 = query_rot[(Ellipsis, slice(16, None, None))] + query_rot = None + neg = -x2 + x2 = None + cat_1 = torch.cat((neg, x1), dim=-1) + neg = x1 = None + mul_4 = cat_1 * sin_3 + cat_1 = None + q_embed = mul_3 + mul_4 + mul_3 = mul_4 = None + mul_5 = key_rot * cos_3 + cos_3 = None + x1_1 = key_rot[(Ellipsis, slice(None, 16, None))] + x2_1 = key_rot[(Ellipsis, slice(16, None, None))] + key_rot = None + neg_1 = -x2_1 + x2_1 = None + cat_2 = torch.cat((neg_1, x1_1), dim=-1) + neg_1 = x1_1 = None + mul_6 = cat_2 * sin_3 + cat_2 = sin_3 = None + k_embed = mul_5 + mul_6 + mul_5 = mul_6 = None + query_states_1 = torch.cat((q_embed, query_pass), dim=-1) + q_embed = query_pass = None + key_states_1 = torch.cat((k_embed, key_pass), dim=-1) + k_embed = key_pass = None + attention_mask_1 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query = query_states_1.contiguous() + query_states_1 = None + key = key_states_1.contiguous() + value = value_states.contiguous() + attn_output = torch._C._nn.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask_1, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query = key = value = attention_mask_1 = None + transpose_4 = attn_output.transpose(1, 2) + attn_output = None + attn_output_1 = transpose_4.contiguous() + transpose_4 = None + reshape = attn_output_1.reshape(1, 2, -1) + attn_output_1 = None + attn_output_2 = reshape.contiguous() + reshape = None + attn_output_3 = torch._C._nn.linear( + attn_output_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs = torch.nn.functional.dropout(attn_output_3, 0.0, False, False) + attn_output_3 = None + hidden_states_1 = torch._C._nn.linear( + hidden_states, + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_7 = 0.5 * hidden_states_1 + pow_1 = torch.pow(hidden_states_1, 3.0) + mul_8 = 0.044715 * pow_1 + pow_1 = None + add_2 = hidden_states_1 + mul_8 + hidden_states_1 = mul_8 = None + mul_9 = 0.7978845608028654 * add_2 + add_2 = None + tanh = torch.tanh(mul_9) + mul_9 = None + add_3 = 1.0 + tanh + tanh = None + hidden_states_2 = mul_7 * add_3 + mul_7 = add_3 = None + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + 
+        l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_2 = (
+        l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states = torch.nn.functional.dropout(
+        hidden_states_3, 0.0, False, False
+    )
+    hidden_states_3 = None
+    add_4 = attn_outputs + feed_forward_hidden_states
+    attn_outputs = feed_forward_hidden_states = None
+    hidden_states_4 = add_4 + inputs_embeds
+    add_4 = inputs_embeds = None
+    hidden_states_5 = torch.nn.functional.layer_norm(
+        hidden_states_4,
+        (2048,),
+        l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_6 = torch._C._nn.linear(
+        hidden_states_5,
+        l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_4 = linear_6.view((1, 2, -1, 64))
+    linear_6 = None
+    query_states_2 = view_4.transpose(1, 2)
+    view_4 = None
+    linear_7 = torch._C._nn.linear(
+        hidden_states_5,
+        l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_5 = linear_7.view((1, 2, -1, 64))
+    linear_7 = None
+    key_states_2 = view_5.transpose(1, 2)
+    view_5 = None
+    linear_8 = torch._C._nn.linear(
+        hidden_states_5,
+        l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_6 = linear_8.view((1, 2, -1, 64))
+    linear_8 = None
+    value_states_1 = view_6.transpose(1, 2)
+    view_6 = None
+    query_rot_1 = query_states_2[(Ellipsis, slice(None, 32, None))]
+    query_pass_1 = query_states_2[(Ellipsis, slice(32, None, None))]
+    query_states_2 = None
+    key_rot_1 = key_states_2[(Ellipsis, slice(None, 32, None))]
+    key_pass_1 = key_states_2[(Ellipsis, slice(32, None, None))]
+    key_states_2 = None
+    cos_4 = cos_2.unsqueeze(1)
+    sin_4 = sin_2.unsqueeze(1)
+    mul_11 = query_rot_1 * cos_4
+    x1_2 = query_rot_1[(Ellipsis, slice(None, 16, None))]
+    x2_2 = query_rot_1[(Ellipsis, slice(16, None, None))]
+    query_rot_1 = None
+    neg_2 = -x2_2
+    x2_2 = None
+    cat_5 = torch.cat((neg_2, x1_2), dim=-1)
+    neg_2 = x1_2 = None
+    mul_12 = cat_5 * sin_4
+    cat_5 = None
+    q_embed_1 = mul_11 + mul_12
+    mul_11 = mul_12 = None
+    mul_13 = key_rot_1 * cos_4
+    cos_4 = None
+    x1_3 = key_rot_1[(Ellipsis, slice(None, 16, None))]
+    x2_3 = key_rot_1[(Ellipsis, slice(16, None, None))]
+    key_rot_1 = None
+    neg_3 = -x2_3
+    x2_3 = None
+    cat_6 = torch.cat((neg_3, x1_3), dim=-1)
+    neg_3 = x1_3 = None
+    mul_14 = cat_6 * sin_4
+    cat_6 = sin_4 = None
+    k_embed_1 = mul_13 + mul_14
+    mul_13 = mul_14 = None
+    query_states_3 = torch.cat((q_embed_1, query_pass_1), dim=-1)
+    q_embed_1 = query_pass_1 = None
+    key_states_3 = torch.cat((k_embed_1, key_pass_1), dim=-1)
+    k_embed_1 = key_pass_1 = None
+    attention_mask_2 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_1 = query_states_3.contiguous()
+    query_states_3 = None
+    key_1 = key_states_3.contiguous()
+    value_1 = value_states_1.contiguous()
+    attn_output_4 = torch._C._nn.scaled_dot_product_attention(
+        query_1,
+        key_1,
+        value_1,
+        attn_mask=attention_mask_2,
+        dropout_p=0.0,
+        scale=0.125,
+        is_causal=False,
+    )
+    query_1 = key_1 = value_1 = attention_mask_2 = None
+    transpose_8 = attn_output_4.transpose(1, 2)
+    attn_output_4 = None
+    attn_output_5 = transpose_8.contiguous()
+    transpose_8 = None
+    reshape_1 = attn_output_5.reshape(1, 2, -1)
+    attn_output_5 = None
+    attn_output_6 = reshape_1.contiguous()
+    reshape_1 = None
+    attn_output_7 = torch._C._nn.linear(
+        attn_output_6,
+        l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_6 = l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_1 = torch.nn.functional.dropout(attn_output_7, 0.0, False, False)
+    attn_output_7 = None
+    hidden_states_6 = torch._C._nn.linear(
+        hidden_states_5,
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_5 = (
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_15 = 0.5 * hidden_states_6
+    pow_2 = torch.pow(hidden_states_6, 3.0)
+    mul_16 = 0.044715 * pow_2
+    pow_2 = None
+    add_8 = hidden_states_6 + mul_16
+    hidden_states_6 = mul_16 = None
+    mul_17 = 0.7978845608028654 * add_8
+    add_8 = None
+    tanh_1 = torch.tanh(mul_17)
+    mul_17 = None
+    add_9 = 1.0 + tanh_1
+    tanh_1 = None
+    hidden_states_7 = mul_15 * add_9
+    mul_15 = add_9 = None
+    hidden_states_8 = torch._C._nn.linear(
+        hidden_states_7,
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_7 = (
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_1 = torch.nn.functional.dropout(
+        hidden_states_8, 0.0, False, False
+    )
+    hidden_states_8 = None
+    add_10 = attn_outputs_1 + feed_forward_hidden_states_1
+    attn_outputs_1 = feed_forward_hidden_states_1 = None
+    hidden_states_9 = add_10 + hidden_states_4
+    add_10 = hidden_states_4 = None
+    hidden_states_10 = torch.nn.functional.layer_norm(
+        hidden_states_9,
+        (2048,),
+        l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_12 = torch._C._nn.linear(
+        hidden_states_10,
+        l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_7 = linear_12.view((1, 2, -1, 64))
+    linear_12 = None
+    query_states_4 = view_7.transpose(1, 2)
+    view_7 = None
+    linear_13 = torch._C._nn.linear(
+        hidden_states_10,
+        l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_8 = linear_13.view((1, 2, -1, 64))
+    linear_13 = None
+    key_states_4 = view_8.transpose(1, 2)
+    view_8 = None
+    linear_14 = torch._C._nn.linear(
+        hidden_states_10,
+        l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_9 = linear_14.view((1, 2, -1, 64))
+    linear_14 = None
+    value_states_2 = view_9.transpose(1, 2)
+    view_9 = None
+    query_rot_2 = query_states_4[(Ellipsis, slice(None, 32, None))]
+    query_pass_2 = query_states_4[(Ellipsis, slice(32, None, None))]
+    query_states_4 = None
+    key_rot_2 = key_states_4[(Ellipsis, slice(None, 32, None))]
+    key_pass_2 = key_states_4[(Ellipsis, slice(32, None, None))]
+    key_states_4 = None
+    cos_5 = cos_2.unsqueeze(1)
+    sin_5 = sin_2.unsqueeze(1)
+    mul_19 = query_rot_2 * cos_5
+    x1_4 = query_rot_2[(Ellipsis, slice(None, 16, None))]
+    x2_4 = query_rot_2[(Ellipsis, slice(16, None, None))]
+    query_rot_2 = None
+    neg_4 = -x2_4
+    x2_4 = None
+    cat_9 = torch.cat((neg_4, x1_4), dim=-1)
+    neg_4 = x1_4 = None
+    mul_20 = cat_9 * sin_5
+    cat_9 = None
+    q_embed_2 = mul_19 + mul_20
+    mul_19 = mul_20 = None
+    mul_21 = key_rot_2 * cos_5
+    cos_5 = None
+    x1_5 = key_rot_2[(Ellipsis, slice(None, 16, None))]
+    x2_5 = key_rot_2[(Ellipsis, slice(16, None, None))]
+    key_rot_2 = None
+    neg_5 = -x2_5
+    x2_5 = None
+    cat_10 = torch.cat((neg_5, x1_5), dim=-1)
+    neg_5 = x1_5 = None
+    mul_22 = cat_10 * sin_5
+    cat_10 = sin_5 = None
+    k_embed_2 = mul_21 + mul_22
+    mul_21 = mul_22 = None
+    query_states_5 = torch.cat((q_embed_2, query_pass_2), dim=-1)
+    q_embed_2 = query_pass_2 = None
+    key_states_5 = torch.cat((k_embed_2, key_pass_2), dim=-1)
+    k_embed_2 = key_pass_2 = None
+    attention_mask_3 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_2 = query_states_5.contiguous()
+    query_states_5 = None
+    key_2 = key_states_5.contiguous()
+    value_2 = value_states_2.contiguous()
+    attn_output_8 = torch._C._nn.scaled_dot_product_attention(
+        query_2,
+        key_2,
+        value_2,
+        attn_mask=attention_mask_3,
+        dropout_p=0.0,
+        scale=0.125,
+        is_causal=False,
+    )
+    query_2 = key_2 = value_2 = attention_mask_3 = None
+    transpose_12 = attn_output_8.transpose(1, 2)
+    attn_output_8 = None
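# --- [Editor's note] Illustrative sketch; not part of the captured graph. ---
# Each layer above applies a *partial* rotary position embedding: only the
# first 32 of the 64 head dimensions are rotated (query_rot_*/key_rot_*); the
# remaining 32 pass through untouched (query_pass_*/key_pass_*). The traced
# slice/neg/cat sequence is the usual rotate_half formulation, sketched here
# assuming (batch, heads, seq, head_dim) inputs and broadcastable cos/sin:
import torch

def apply_partial_rope(x, cos, sin, rotary_dim=32):
    rot, other = x[..., :rotary_dim], x[..., rotary_dim:]
    x1, x2 = rot[..., : rotary_dim // 2], rot[..., rotary_dim // 2 :]
    rotated = torch.cat((-x2, x1), dim=-1)  # rotate_half, as in cat_5/cat_6
    return torch.cat((rot * cos + rotated * sin, other), dim=-1)
# ----------------------------------------------------------------------------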
+    attn_output_9 = transpose_12.contiguous()
+    transpose_12 = None
+    reshape_2 = attn_output_9.reshape(1, 2, -1)
+    attn_output_9 = None
+    attn_output_10 = reshape_2.contiguous()
+    reshape_2 = None
+    attn_output_11 = torch._C._nn.linear(
+        attn_output_10,
+        l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_10 = l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_2 = torch.nn.functional.dropout(attn_output_11, 0.0, False, False)
+    attn_output_11 = None
+    hidden_states_11 = torch._C._nn.linear(
+        hidden_states_10,
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_10 = (
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_23 = 0.5 * hidden_states_11
+    pow_3 = torch.pow(hidden_states_11, 3.0)
+    mul_24 = 0.044715 * pow_3
+    pow_3 = None
+    add_14 = hidden_states_11 + mul_24
+    hidden_states_11 = mul_24 = None
+    mul_25 = 0.7978845608028654 * add_14
+    add_14 = None
+    tanh_2 = torch.tanh(mul_25)
+    mul_25 = None
+    add_15 = 1.0 + tanh_2
+    tanh_2 = None
+    hidden_states_12 = mul_23 * add_15
+    mul_23 = add_15 = None
+    hidden_states_13 = torch._C._nn.linear(
+        hidden_states_12,
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_12 = (
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_2 = torch.nn.functional.dropout(
+        hidden_states_13, 0.0, False, False
+    )
+    hidden_states_13 = None
+    add_16 = attn_outputs_2 + feed_forward_hidden_states_2
+    attn_outputs_2 = feed_forward_hidden_states_2 = None
+    hidden_states_14 = add_16 + hidden_states_9
+    add_16 = hidden_states_9 = None
+    hidden_states_15 = torch.nn.functional.layer_norm(
+        hidden_states_14,
+        (2048,),
+        l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_18 = torch._C._nn.linear(
+        hidden_states_15,
+        l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_10 = linear_18.view((1, 2, -1, 64))
+    linear_18 = None
+    query_states_6 = view_10.transpose(1, 2)
+    view_10 = None
+    linear_19 = torch._C._nn.linear(
+        hidden_states_15,
+        l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_11 = linear_19.view((1, 2, -1, 64))
+    linear_19 = None
+    key_states_6 = view_11.transpose(1, 2)
+    view_11 = None
+    linear_20 = torch._C._nn.linear(
+        hidden_states_15,
+        l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_12 = linear_20.view((1, 2, -1, 64))
+    linear_20 = None
+    value_states_3 = view_12.transpose(1, 2)
+    view_12 = None
+    query_rot_3 = query_states_6[(Ellipsis, slice(None, 32, None))]
+    query_pass_3 = query_states_6[(Ellipsis, slice(32, None, None))]
+    query_states_6 = None
+    key_rot_3 = key_states_6[(Ellipsis, slice(None, 32, None))]
+    key_pass_3 = key_states_6[(Ellipsis, slice(32, None, None))]
+    key_states_6 = None
+    cos_6 = cos_2.unsqueeze(1)
+    sin_6 = sin_2.unsqueeze(1)
+    mul_27 = query_rot_3 * cos_6
+    x1_6 = query_rot_3[(Ellipsis, slice(None, 16, None))]
+    x2_6 = query_rot_3[(Ellipsis, slice(16, None, None))]
+    query_rot_3 = None
+    neg_6 = -x2_6
+    x2_6 = None
+    cat_13 = torch.cat((neg_6, x1_6), dim=-1)
+    neg_6 = x1_6 = None
+    mul_28 = cat_13 * sin_6
+    cat_13 = None
+    q_embed_3 = mul_27 + mul_28
+    mul_27 = mul_28 = None
+    mul_29 = key_rot_3 * cos_6
+    cos_6 = None
+    x1_7 = key_rot_3[(Ellipsis, slice(None, 16, None))]
+    x2_7 = key_rot_3[(Ellipsis, slice(16, None, None))]
+    key_rot_3 = None
+    neg_7 = -x2_7
+    x2_7 = None
+    cat_14 = torch.cat((neg_7, x1_7), dim=-1)
+    neg_7 = x1_7 = None
+    mul_30 = cat_14 * sin_6
+    cat_14 = sin_6 = None
+    k_embed_3 = mul_29 + mul_30
+    mul_29 = mul_30 = None
+    query_states_7 = torch.cat((q_embed_3, query_pass_3), dim=-1)
+    q_embed_3 = query_pass_3 = None
+    key_states_7 = torch.cat((k_embed_3, key_pass_3), dim=-1)
+    k_embed_3 = key_pass_3 = None
+    attention_mask_4 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_3 = query_states_7.contiguous()
+    query_states_7 = None
+    key_3 = key_states_7.contiguous()
+    value_3 = value_states_3.contiguous()
+    attn_output_12 = torch._C._nn.scaled_dot_product_attention(
+        query_3,
+        key_3,
+        value_3,
+        attn_mask=attention_mask_4,
+        dropout_p=0.0,
+        scale=0.125,
+        is_causal=False,
+    )
+    query_3 = key_3 = value_3 = attention_mask_4 = None
+    transpose_16 = attn_output_12.transpose(1, 2)
+    attn_output_12 = None
+    attn_output_13 = transpose_16.contiguous()
+    transpose_16 = None
+    reshape_3 = attn_output_13.reshape(1, 2, -1)
+    attn_output_13 = None
+    attn_output_14 = reshape_3.contiguous()
+    reshape_3 = None
+    attn_output_15 = torch._C._nn.linear(
+        attn_output_14,
+        l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_14 = l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_3 = torch.nn.functional.dropout(attn_output_15, 0.0, False, False)
+    attn_output_15 = None
+    hidden_states_16 = torch._C._nn.linear(
+        hidden_states_15,
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_15 = (
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_31 = 0.5 * hidden_states_16
+    pow_4 = torch.pow(hidden_states_16, 3.0)
+    mul_32 = 0.044715 * pow_4
+    pow_4 = None
+    add_20 = hidden_states_16 + mul_32
+    hidden_states_16 = mul_32 = None
+    mul_33 = 0.7978845608028654 * add_20
+    add_20 = None
+    tanh_3 = torch.tanh(mul_33)
+    mul_33 = None
+    add_21 = 1.0 + tanh_3
+    tanh_3 = None
+    hidden_states_17 = mul_31 * add_21
+    mul_31 = add_21 = None
+    hidden_states_18 = torch._C._nn.linear(
+        hidden_states_17,
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_17 = (
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_3 = torch.nn.functional.dropout(
+        hidden_states_18, 0.0, False, False
+    )
+    hidden_states_18 = None
+    add_22 = attn_outputs_3 + feed_forward_hidden_states_3
+    attn_outputs_3 = feed_forward_hidden_states_3 = None
+    hidden_states_19 = add_22 + hidden_states_14
+    add_22 = hidden_states_14 = None
+    hidden_states_20 = torch.nn.functional.layer_norm(
+        hidden_states_19,
+        (2048,),
+        l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_24 = torch._C._nn.linear(
+        hidden_states_20,
+        l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_13 = linear_24.view((1, 2, -1, 64))
+    linear_24 = None
+    query_states_8 = view_13.transpose(1, 2)
+    view_13 = None
+    linear_25 = torch._C._nn.linear(
+        hidden_states_20,
+        l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_14 = linear_25.view((1, 2, -1, 64))
+    linear_25 = None
+    key_states_8 = view_14.transpose(1, 2)
+    view_14 = None
+    linear_26 = torch._C._nn.linear(
+        hidden_states_20,
+        l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_15 = linear_26.view((1, 2, -1, 64))
+    linear_26 = None
+    value_states_4 = view_15.transpose(1, 2)
+    view_15 = None
+    query_rot_4 = query_states_8[(Ellipsis, slice(None, 32, None))]
+    query_pass_4 = query_states_8[(Ellipsis, slice(32, None, None))]
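# --- [Editor's note] Illustrative sketch; not part of the captured graph. ---
# The mul_*/pow_*/tanh_* chain in every MLP block is the tanh approximation of
# GELU that the tracer inlined: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715
# * x**3))), where 0.7978845608028654 == sqrt(2/pi). A compact equivalent:
import torch

def gelu_tanh(x):
    inner = 0.7978845608028654 * (x + 0.044715 * torch.pow(x, 3.0))
    return 0.5 * x * (1.0 + torch.tanh(inner))
# Matches torch.nn.functional.gelu(x, approximate="tanh") up to float rounding.
# ----------------------------------------------------------------------------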
+    query_states_8 = None
+    key_rot_4 = key_states_8[(Ellipsis, slice(None, 32, None))]
+    key_pass_4 = key_states_8[(Ellipsis, slice(32, None, None))]
+    key_states_8 = None
+    cos_7 = cos_2.unsqueeze(1)
+    sin_7 = sin_2.unsqueeze(1)
+    mul_35 = query_rot_4 * cos_7
+    x1_8 = query_rot_4[(Ellipsis, slice(None, 16, None))]
+    x2_8 = query_rot_4[(Ellipsis, slice(16, None, None))]
+    query_rot_4 = None
+    neg_8 = -x2_8
+    x2_8 = None
+    cat_17 = torch.cat((neg_8, x1_8), dim=-1)
+    neg_8 = x1_8 = None
+    mul_36 = cat_17 * sin_7
+    cat_17 = None
+    q_embed_4 = mul_35 + mul_36
+    mul_35 = mul_36 = None
+    mul_37 = key_rot_4 * cos_7
+    cos_7 = None
+    x1_9 = key_rot_4[(Ellipsis, slice(None, 16, None))]
+    x2_9 = key_rot_4[(Ellipsis, slice(16, None, None))]
+    key_rot_4 = None
+    neg_9 = -x2_9
+    x2_9 = None
+    cat_18 = torch.cat((neg_9, x1_9), dim=-1)
+    neg_9 = x1_9 = None
+    mul_38 = cat_18 * sin_7
+    cat_18 = sin_7 = None
+    k_embed_4 = mul_37 + mul_38
+    mul_37 = mul_38 = None
+    query_states_9 = torch.cat((q_embed_4, query_pass_4), dim=-1)
+    q_embed_4 = query_pass_4 = None
+    key_states_9 = torch.cat((k_embed_4, key_pass_4), dim=-1)
+    k_embed_4 = key_pass_4 = None
+    attention_mask_5 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_4 = query_states_9.contiguous()
+    query_states_9 = None
+    key_4 = key_states_9.contiguous()
+    value_4 = value_states_4.contiguous()
+    attn_output_16 = torch._C._nn.scaled_dot_product_attention(
+        query_4,
+        key_4,
+        value_4,
+        attn_mask=attention_mask_5,
+        dropout_p=0.0,
+        scale=0.125,
+        is_causal=False,
+    )
+    query_4 = key_4 = value_4 = attention_mask_5 = None
+    transpose_20 = attn_output_16.transpose(1, 2)
+    attn_output_16 = None
+    attn_output_17 = transpose_20.contiguous()
+    transpose_20 = None
+    reshape_4 = attn_output_17.reshape(1, 2, -1)
+    attn_output_17 = None
+    attn_output_18 = reshape_4.contiguous()
+    reshape_4 = None
+    attn_output_19 = torch._C._nn.linear(
+        attn_output_18,
+        l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_18 = l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_4 = torch.nn.functional.dropout(attn_output_19, 0.0, False, False)
+    attn_output_19 = None
+    hidden_states_21 = torch._C._nn.linear(
+        hidden_states_20,
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_20 = (
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_39 = 0.5 * hidden_states_21
+    pow_5 = torch.pow(hidden_states_21, 3.0)
+    mul_40 = 0.044715 * pow_5
+    pow_5 = None
+    add_26 = hidden_states_21 + mul_40
+    hidden_states_21 = mul_40 = None
+    mul_41 = 0.7978845608028654 * add_26
+    add_26 = None
+    tanh_4 = torch.tanh(mul_41)
+    mul_41 = None
+    add_27 = 1.0 + tanh_4
+    tanh_4 = None
+    hidden_states_22 = mul_39 * add_27
+    mul_39 = add_27 = None
+    hidden_states_23 = torch._C._nn.linear(
+        hidden_states_22,
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_22 = (
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_4 = torch.nn.functional.dropout(
+        hidden_states_23, 0.0, False, False
+    )
+    hidden_states_23 = None
+    add_28 = attn_outputs_4 + feed_forward_hidden_states_4
+    attn_outputs_4 = feed_forward_hidden_states_4 = None
+    hidden_states_24 = add_28 + hidden_states_19
+    add_28 = hidden_states_19 = None
+    hidden_states_25 = torch.nn.functional.layer_norm(
+        hidden_states_24,
+        (2048,),
+        l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_30 = torch._C._nn.linear(
+        hidden_states_25,
+        l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_16 = linear_30.view((1, 2, -1, 64))
+    linear_30 = None
+    query_states_10 = view_16.transpose(1, 2)
+    view_16 = None
+    linear_31 = torch._C._nn.linear(
+        hidden_states_25,
+        l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_17 = linear_31.view((1, 2, -1, 64))
+    linear_31 = None
+    key_states_10 = view_17.transpose(1, 2)
+    view_17 = None
+    linear_32 = torch._C._nn.linear(
+        hidden_states_25,
+        l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_18 = linear_32.view((1, 2, -1, 64))
+    linear_32 = None
+    value_states_5 = view_18.transpose(1, 2)
+    view_18 = None
+    query_rot_5 = query_states_10[(Ellipsis, slice(None, 32, None))]
+    query_pass_5 = query_states_10[(Ellipsis, slice(32, None, None))]
+    query_states_10 = None
+    key_rot_5 = key_states_10[(Ellipsis, slice(None, 32, None))]
+    key_pass_5 = key_states_10[(Ellipsis, slice(32, None, None))]
+    key_states_10 = None
+    cos_8 = cos_2.unsqueeze(1)
+    sin_8 = sin_2.unsqueeze(1)
+    mul_43 = query_rot_5 * cos_8
+    x1_10 = query_rot_5[(Ellipsis, slice(None, 16, None))]
+    x2_10 = query_rot_5[(Ellipsis, slice(16, None, None))]
+    query_rot_5 = None
+    neg_10 = -x2_10
+    x2_10 = None
+    cat_21 = torch.cat((neg_10, x1_10), dim=-1)
+    neg_10 = x1_10 = None
+    mul_44 = cat_21 * sin_8
+    cat_21 = None
+    q_embed_5 = mul_43 + mul_44
+    mul_43 = mul_44 = None
+    mul_45 = key_rot_5 * cos_8
+    cos_8 = None
+    x1_11 = key_rot_5[(Ellipsis, slice(None, 16, None))]
+    x2_11 = key_rot_5[(Ellipsis, slice(16, None, None))]
+    key_rot_5 = None
+    neg_11 = -x2_11
+    x2_11 = None
+    cat_22 = torch.cat((neg_11, x1_11), dim=-1)
+    neg_11 = x1_11 = None
+    mul_46 = cat_22 * sin_8
+    cat_22 = sin_8 = None
+    k_embed_5 = mul_45 + mul_46
+    mul_45 = mul_46 = None
+    query_states_11 = torch.cat((q_embed_5, query_pass_5), dim=-1)
+    q_embed_5 = query_pass_5 = None
+    key_states_11 = torch.cat((k_embed_5, key_pass_5), dim=-1)
+    k_embed_5 = key_pass_5 = None
+    attention_mask_6 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_5 = query_states_11.contiguous()
+    query_states_11 = None
+    key_5 = key_states_11.contiguous()
+    value_5 = value_states_5.contiguous()
+    attn_output_20 = torch._C._nn.scaled_dot_product_attention(
+        query_5,
+        key_5,
+        value_5,
+        attn_mask=attention_mask_6,
+        dropout_p=0.0,
+        scale=0.125,
+        is_causal=False,
+    )
+    query_5 = key_5 = value_5 = attention_mask_6 = None
+    transpose_24 = attn_output_20.transpose(1, 2)
+    attn_output_20 = None
+    attn_output_21 = transpose_24.contiguous()
+    transpose_24 = None
+    reshape_5 = attn_output_21.reshape(1, 2, -1)
+    attn_output_21 = None
+    attn_output_22 = reshape_5.contiguous()
+    reshape_5 = None
+    attn_output_23 = torch._C._nn.linear(
+        attn_output_22,
+        l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_22 = l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_5 = torch.nn.functional.dropout(attn_output_23, 0.0, False, False)
+    attn_output_23 = None
+    hidden_states_26 = torch._C._nn.linear(
+        hidden_states_25,
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_25 = (
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_47 = 0.5 * hidden_states_26
+    pow_6 = torch.pow(hidden_states_26, 3.0)
+    mul_48 = 0.044715 * pow_6
+    pow_6 = None
+    add_32 = hidden_states_26 + mul_48
+    hidden_states_26 = mul_48 = None
+    mul_49 = 0.7978845608028654 * add_32
+    add_32 = None
+    tanh_5 = torch.tanh(mul_49)
+    mul_49 = None
+    add_33 = 1.0 + tanh_5
+    tanh_5 = None
+    hidden_states_27 = mul_47 * add_33
+    mul_47 = add_33 = None
+    hidden_states_28 = torch._C._nn.linear(
+        hidden_states_27,
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_27 = (
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_5 = torch.nn.functional.dropout(
+        hidden_states_28, 0.0, False, False
+    )
+    hidden_states_28 = None
+    add_34 = attn_outputs_5 + feed_forward_hidden_states_5
+    attn_outputs_5 = feed_forward_hidden_states_5 = None
+    hidden_states_29 = add_34 + hidden_states_24
+    add_34 = hidden_states_24 = None
+    hidden_states_30 = torch.nn.functional.layer_norm(
+        hidden_states_29,
+        (2048,),
+        l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_36 = torch._C._nn.linear(
+        hidden_states_30,
+        l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_19 = linear_36.view((1, 2, -1, 64))
+    linear_36 = None
+    query_states_12 = view_19.transpose(1, 2)
+    view_19 = None
+    linear_37 = torch._C._nn.linear(
+        hidden_states_30,
+        l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_20 = linear_37.view((1, 2, -1, 64))
+    linear_37 = None
+    key_states_12 = view_20.transpose(1, 2)
+    view_20 = None
+    linear_38 = torch._C._nn.linear(
+        hidden_states_30,
+        l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_21 = linear_38.view((1, 2, -1, 64))
+    linear_38 = None
+    value_states_6 = view_21.transpose(1, 2)
+    view_21 = None
+    query_rot_6 = query_states_12[(Ellipsis, slice(None, 32, None))]
+    query_pass_6 = query_states_12[(Ellipsis, slice(32, None, None))]
+    query_states_12 = None
+    key_rot_6 = key_states_12[(Ellipsis, slice(None, 32, None))]
+    key_pass_6 = key_states_12[(Ellipsis, slice(32, None, None))]
+    key_states_12 = None
+    cos_9 = cos_2.unsqueeze(1)
+    sin_9 = sin_2.unsqueeze(1)
+    mul_51 = query_rot_6 * cos_9
+    x1_12 = query_rot_6[(Ellipsis, slice(None, 16, None))]
+    x2_12 = query_rot_6[(Ellipsis, slice(16, None, None))]
+    query_rot_6 = None
+    neg_12 = -x2_12
+    x2_12 = None
+    cat_25 = torch.cat((neg_12, x1_12), dim=-1)
+    neg_12 = x1_12 = None
+    mul_52 = cat_25 * sin_9
+    cat_25 = None
+    q_embed_6 = mul_51 + mul_52
+    mul_51 = mul_52 = None
+    mul_53 = key_rot_6 * cos_9
+    cos_9 = None
+    x1_13 = key_rot_6[(Ellipsis, slice(None, 16, None))]
+    x2_13 = key_rot_6[(Ellipsis, slice(16, None, None))]
+    key_rot_6 = None
+    neg_13 = -x2_13
+    x2_13 = None
+    cat_26 = torch.cat((neg_13, x1_13), dim=-1)
+    neg_13 = x1_13 = None
+    mul_54 = cat_26 * sin_9
+    cat_26 = sin_9 = None
+    k_embed_6 = mul_53 + mul_54
+    mul_53 = mul_54 = None
+    query_states_13 = torch.cat((q_embed_6, query_pass_6), dim=-1)
+    q_embed_6 = query_pass_6 = None
+    key_states_13 = torch.cat((k_embed_6, key_pass_6), dim=-1)
+    k_embed_6 = key_pass_6 = None
+    attention_mask_7 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_6 = query_states_13.contiguous()
+    query_states_13 = None
+    key_6 = key_states_13.contiguous()
+    value_6 = value_states_6.contiguous()
+    attn_output_24 = torch._C._nn.scaled_dot_product_attention(
+        query_6,
+        key_6,
+        value_6,
+        attn_mask=attention_mask_7,
+        dropout_p=0.0,
+        scale=0.125,
+        is_causal=False,
+    )
+    query_6 = key_6 = value_6 = attention_mask_7 = None
+    transpose_28 = attn_output_24.transpose(1, 2)
+    attn_output_24 = None
+    attn_output_25 = transpose_28.contiguous()
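# --- [Editor's note] Illustrative sketch; not part of the captured graph. ---
# Every attention block lowers to one scaled_dot_product_attention call with
# an explicit scale=0.125, i.e. 1/sqrt(head_dim) for head_dim=64, and slices
# the precomputed causal mask down to the current key length (2 tokens in
# this trace). The same call through the public API, with assumed shapes
# (requires PyTorch 2.1+ for the scale= keyword):
import math
import torch
import torch.nn.functional as F

q = torch.randn(1, 32, 2, 64)            # (batch, heads, seq, head_dim)
k = torch.randn(1, 32, 2, 64)
v = torch.randn(1, 32, 2, 64)
causal_mask = torch.zeros(1, 1, 2, 2)    # stand-in for causal_mask_2
out = F.scaled_dot_product_attention(
    q, k, v,
    attn_mask=causal_mask[:, :, :, :2],  # the attention_mask_* slice above
    dropout_p=0.0,
    scale=1.0 / math.sqrt(64),           # == 0.125
)
# ----------------------------------------------------------------------------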
+    transpose_28 = None
+    reshape_6 = attn_output_25.reshape(1, 2, -1)
+    attn_output_25 = None
+    attn_output_26 = reshape_6.contiguous()
+    reshape_6 = None
+    attn_output_27 = torch._C._nn.linear(
+        attn_output_26,
+        l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_26 = l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_6 = torch.nn.functional.dropout(attn_output_27, 0.0, False, False)
+    attn_output_27 = None
+    hidden_states_31 = torch._C._nn.linear(
+        hidden_states_30,
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_30 = (
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_55 = 0.5 * hidden_states_31
+    pow_7 = torch.pow(hidden_states_31, 3.0)
+    mul_56 = 0.044715 * pow_7
+    pow_7 = None
+    add_38 = hidden_states_31 + mul_56
+    hidden_states_31 = mul_56 = None
+    mul_57 = 0.7978845608028654 * add_38
+    add_38 = None
+    tanh_6 = torch.tanh(mul_57)
+    mul_57 = None
+    add_39 = 1.0 + tanh_6
+    tanh_6 = None
+    hidden_states_32 = mul_55 * add_39
+    mul_55 = add_39 = None
+    hidden_states_33 = torch._C._nn.linear(
+        hidden_states_32,
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_32 = (
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_6 = torch.nn.functional.dropout(
+        hidden_states_33, 0.0, False, False
+    )
+    hidden_states_33 = None
+    add_40 = attn_outputs_6 + feed_forward_hidden_states_6
+    attn_outputs_6 = feed_forward_hidden_states_6 = None
+    hidden_states_34 = add_40 + hidden_states_29
+    add_40 = hidden_states_29 = None
+    hidden_states_35 = torch.nn.functional.layer_norm(
+        hidden_states_34,
+        (2048,),
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_42 = torch._C._nn.linear(
+        hidden_states_35,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_22 = linear_42.view((1, 2, -1, 64))
+    linear_42 = None
+    query_states_14 = view_22.transpose(1, 2)
+    view_22 = None
+    linear_43 = torch._C._nn.linear(
+        hidden_states_35,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_23 = linear_43.view((1, 2, -1, 64))
+    linear_43 = None
+    key_states_14 = view_23.transpose(1, 2)
+    view_23 = None
+    linear_44 = torch._C._nn.linear(
+        hidden_states_35,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_24 = linear_44.view((1, 2, -1, 64))
+    linear_44 = None
+    value_states_7 = view_24.transpose(1, 2)
+    view_24 = None
+    query_rot_7 = query_states_14[(Ellipsis, slice(None, 32, None))]
+    query_pass_7 = query_states_14[(Ellipsis, slice(32, None, None))]
+    query_states_14 = None
+    key_rot_7 = key_states_14[(Ellipsis, slice(None, 32, None))]
+    key_pass_7 = key_states_14[(Ellipsis, slice(32, None, None))]
+    key_states_14 = None
+    cos_10 = cos_2.unsqueeze(1)
+    sin_10 = sin_2.unsqueeze(1)
+    mul_59 = query_rot_7 * cos_10
+    x1_14 = query_rot_7[(Ellipsis, slice(None, 16, None))]
+    x2_14 = query_rot_7[(Ellipsis, slice(16, None, None))]
+    query_rot_7 = None
+    neg_14 = -x2_14
+    x2_14 = None
+    cat_29 = torch.cat((neg_14, x1_14), dim=-1)
+    neg_14 = x1_14 = None
+    mul_60 = cat_29 * sin_10
+    cat_29 = None
+    q_embed_7 = mul_59 + mul_60
+    mul_59 = mul_60 = None
+    mul_61 = key_rot_7 * cos_10
+    cos_10 = None
+    x1_15 = key_rot_7[(Ellipsis, slice(None, 16, None))]
+    x2_15 = key_rot_7[(Ellipsis, slice(16, None, None))]
+    key_rot_7 = None
+    neg_15 = -x2_15
+    x2_15 = None
+    cat_30 = torch.cat((neg_15, x1_15), dim=-1)
+    neg_15 = x1_15 = None
+    mul_62 = cat_30 * sin_10
+    cat_30 = sin_10 = None
+    k_embed_7 = mul_61 + mul_62
+    mul_61 = mul_62 = None
+    query_states_15 = torch.cat((q_embed_7, query_pass_7), dim=-1)
+    q_embed_7 = query_pass_7 = None
+    key_states_15 = torch.cat((k_embed_7, key_pass_7), dim=-1)
+    k_embed_7 = key_pass_7 = None
+    attention_mask_8 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_7 = query_states_15.contiguous()
+    query_states_15 = None
+    key_7 = key_states_15.contiguous()
+    value_7 = value_states_7.contiguous()
+    attn_output_28 = torch._C._nn.scaled_dot_product_attention(
+        query_7,
+        key_7,
+        value_7,
+        attn_mask=attention_mask_8,
+        dropout_p=0.0,
+        scale=0.125,
+        is_causal=False,
+    )
+    query_7 = key_7 = value_7 = attention_mask_8 = None
+    transpose_32 = attn_output_28.transpose(1, 2)
+    attn_output_28 = None
+    attn_output_29 = transpose_32.contiguous()
+    transpose_32 = None
+    reshape_7 = attn_output_29.reshape(1, 2, -1)
+    attn_output_29 = None
+    attn_output_30 = reshape_7.contiguous()
+    reshape_7 = None
+    attn_output_31 = torch._C._nn.linear(
+        attn_output_30,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_30 = l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_7 = torch.nn.functional.dropout(attn_output_31, 0.0, False, False)
+    attn_output_31 = None
+    hidden_states_36 = torch._C._nn.linear(
+        hidden_states_35,
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_35 = (
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_63 = 0.5 * hidden_states_36
+    pow_8 = torch.pow(hidden_states_36, 3.0)
+    mul_64 = 0.044715 * pow_8
+    pow_8 = None
+    add_44 = hidden_states_36 + mul_64
+    hidden_states_36 = mul_64 = None
+    mul_65 = 0.7978845608028654 * add_44
+    add_44 = None
+    tanh_7 = torch.tanh(mul_65)
+    mul_65 = None
+    add_45 = 1.0 + tanh_7
+    tanh_7 = None
+    hidden_states_37 = mul_63 * add_45
+    mul_63 = add_45 = None
+    hidden_states_38 = torch._C._nn.linear(
+        hidden_states_37,
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_37 = (
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_7 = torch.nn.functional.dropout(
+        hidden_states_38, 0.0, False, False
+    )
+    hidden_states_38 = None
+    add_46 = attn_outputs_7 + feed_forward_hidden_states_7
+    attn_outputs_7 = feed_forward_hidden_states_7 = None
+    hidden_states_39 = add_46 + hidden_states_34
+    add_46 = hidden_states_34 = None
+    hidden_states_40 = torch.nn.functional.layer_norm(
+        hidden_states_39,
+        (2048,),
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_48 = torch._C._nn.linear(
+        hidden_states_40,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_25 = linear_48.view((1, 2, -1, 64))
+    linear_48 = None
+    query_states_16 = view_25.transpose(1, 2)
+    view_25 = None
+    linear_49 = torch._C._nn.linear(
+        hidden_states_40,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_26 = linear_49.view((1, 2, -1, 64))
+    linear_49 = None
+    key_states_16 = view_26.transpose(1, 2)
+    view_26 = None
+    linear_50 = torch._C._nn.linear(
+        hidden_states_40,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_27 = linear_50.view((1, 2, -1, 64))
+    linear_50 = None
+    value_states_8 = view_27.transpose(1, 2)
+    view_27 = None
+    query_rot_8 = query_states_16[(Ellipsis, slice(None, 32, None))]
+    query_pass_8 = query_states_16[(Ellipsis, slice(32, None, None))]
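# --- [Editor's note] Not part of the captured graph. ---
# The pervasive `tmp = None` and chained `a = b = (None)` assignments are
# emitted by torch.compile (Dynamo) so each intermediate and each captured
# parameter is dropped as soon as it is dead, keeping peak memory of the
# generated forward close to the eager module, e.g.:
#     y = torch.relu(x)
#     x = None   # release the last reference to x before the next op
# ---------------------------------------------------------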
+    query_states_16 = None
+    key_rot_8 = key_states_16[(Ellipsis, slice(None, 32, None))]
+    key_pass_8 = key_states_16[(Ellipsis, slice(32, None, None))]
+    key_states_16 = None
+    cos_11 = cos_2.unsqueeze(1)
+    sin_11 = sin_2.unsqueeze(1)
+    mul_67 = query_rot_8 * cos_11
+    x1_16 = query_rot_8[(Ellipsis, slice(None, 16, None))]
+    x2_16 = query_rot_8[(Ellipsis, slice(16, None, None))]
+    query_rot_8 = None
+    neg_16 = -x2_16
+    x2_16 = None
+    cat_33 = torch.cat((neg_16, x1_16), dim=-1)
+    neg_16 = x1_16 = None
+    mul_68 = cat_33 * sin_11
+    cat_33 = None
+    q_embed_8 = mul_67 + mul_68
+    mul_67 = mul_68 = None
+    mul_69 = key_rot_8 * cos_11
+    cos_11 = None
+    x1_17 = key_rot_8[(Ellipsis, slice(None, 16, None))]
+    x2_17 = key_rot_8[(Ellipsis, slice(16, None, None))]
+    key_rot_8 = None
+    neg_17 = -x2_17
+    x2_17 = None
+    cat_34 = torch.cat((neg_17, x1_17), dim=-1)
+    neg_17 = x1_17 = None
+    mul_70 = cat_34 * sin_11
+    cat_34 = sin_11 = None
+    k_embed_8 = mul_69 + mul_70
+    mul_69 = mul_70 = None
+    query_states_17 = torch.cat((q_embed_8, query_pass_8), dim=-1)
+    q_embed_8 = query_pass_8 = None
+    key_states_17 = torch.cat((k_embed_8, key_pass_8), dim=-1)
+    k_embed_8 = key_pass_8 = None
+    attention_mask_9 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_8 = query_states_17.contiguous()
+    query_states_17 = None
+    key_8 = key_states_17.contiguous()
+    value_8 = value_states_8.contiguous()
+    attn_output_32 = torch._C._nn.scaled_dot_product_attention(
+        query_8,
+        key_8,
+        value_8,
+        attn_mask=attention_mask_9,
+        dropout_p=0.0,
+        scale=0.125,
+        is_causal=False,
+    )
+    query_8 = key_8 = value_8 = attention_mask_9 = None
+    transpose_36 = attn_output_32.transpose(1, 2)
+    attn_output_32 = None
+    attn_output_33 = transpose_36.contiguous()
+    transpose_36 = None
+    reshape_8 = attn_output_33.reshape(1, 2, -1)
+    attn_output_33 = None
+    attn_output_34 = reshape_8.contiguous()
+    reshape_8 = None
+    attn_output_35 = torch._C._nn.linear(
+        attn_output_34,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_34 = l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_8 = torch.nn.functional.dropout(attn_output_35, 0.0, False, False)
+    attn_output_35 = None
+    hidden_states_41 = torch._C._nn.linear(
+        hidden_states_40,
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_40 = (
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_71 = 0.5 * hidden_states_41
+    pow_9 = torch.pow(hidden_states_41, 3.0)
+    mul_72 = 0.044715 * pow_9
+    pow_9 = None
+    add_50 = hidden_states_41 + mul_72
+    hidden_states_41 = mul_72 = None
+    mul_73 = 0.7978845608028654 * add_50
+    add_50 = None
+    tanh_8 = torch.tanh(mul_73)
+    mul_73 = None
+    add_51 = 1.0 + tanh_8
+    tanh_8 = None
+    hidden_states_42 = mul_71 * add_51
+    mul_71 = add_51 = None
+    hidden_states_43 = torch._C._nn.linear(
+        hidden_states_42,
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_,
+    )
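# --- [Editor's note] Illustrative sketch; not part of the captured graph. ---
# Each decoder layer is a "parallel" residual block: attention and MLP both
# read the same layer-normed input, and their outputs are summed with the
# residual stream -- the add_* = attn_outputs_* + feed_forward_hidden_states_*
# followed by hidden_states_* = add_* + <previous hidden state> seen below:
def parallel_block(x, input_layernorm, self_attn, mlp):
    h = input_layernorm(x)  # LayerNorm over 2048 features, eps=1e-05
    return self_attn(h) + mlp(h) + x
# ----------------------------------------------------------------------------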
hidden_states_42 = ( + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_8 = torch.nn.functional.dropout( + hidden_states_43, 0.0, False, False + ) + hidden_states_43 = None + add_52 = attn_outputs_8 + feed_forward_hidden_states_8 + attn_outputs_8 = feed_forward_hidden_states_8 = None + hidden_states_44 = add_52 + hidden_states_39 + add_52 = hidden_states_39 = None + hidden_states_45 = torch.nn.functional.layer_norm( + hidden_states_44, + (2048,), + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ + ) = None + linear_54 = torch._C._nn.linear( + hidden_states_45, + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_28 = linear_54.view((1, 2, -1, 64)) + linear_54 = None + query_states_18 = view_28.transpose(1, 2) + view_28 = None + linear_55 = torch._C._nn.linear( + hidden_states_45, + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_29 = linear_55.view((1, 2, -1, 64)) + linear_55 = None + key_states_18 = view_29.transpose(1, 2) + view_29 = None + linear_56 = torch._C._nn.linear( + hidden_states_45, + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_30 = linear_56.view((1, 2, -1, 64)) + linear_56 = None + value_states_9 = view_30.transpose(1, 2) + view_30 = None + query_rot_9 = query_states_18[(Ellipsis, slice(None, 32, None))] + query_pass_9 = query_states_18[(Ellipsis, slice(32, None, None))] + query_states_18 = None + key_rot_9 = key_states_18[(Ellipsis, slice(None, 32, None))] + key_pass_9 = key_states_18[(Ellipsis, slice(32, None, None))] + key_states_18 = None + cos_12 = cos_2.unsqueeze(1) + sin_12 = sin_2.unsqueeze(1) + mul_75 = query_rot_9 * cos_12 + x1_18 = query_rot_9[(Ellipsis, slice(None, 16, None))] + x2_18 = query_rot_9[(Ellipsis, slice(16, None, None))] + query_rot_9 = None + neg_18 = -x2_18 + x2_18 = None + cat_37 = torch.cat((neg_18, x1_18), dim=-1) + neg_18 = x1_18 = None + mul_76 = cat_37 * sin_12 + cat_37 = None + q_embed_9 = mul_75 + mul_76 + mul_75 = mul_76 = None + mul_77 = key_rot_9 * cos_12 + cos_12 = None + x1_19 = key_rot_9[(Ellipsis, slice(None, 16, None))] + x2_19 = key_rot_9[(Ellipsis, slice(16, None, None))] + key_rot_9 = None + neg_19 = -x2_19 + x2_19 = None + cat_38 = torch.cat((neg_19, x1_19), dim=-1) + neg_19 = x1_19 = None + mul_78 
= cat_38 * sin_12 + cat_38 = sin_12 = None + k_embed_9 = mul_77 + mul_78 + mul_77 = mul_78 = None + query_states_19 = torch.cat((q_embed_9, query_pass_9), dim=-1) + q_embed_9 = query_pass_9 = None + key_states_19 = torch.cat((k_embed_9, key_pass_9), dim=-1) + k_embed_9 = key_pass_9 = None + attention_mask_10 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_9 = query_states_19.contiguous() + query_states_19 = None + key_9 = key_states_19.contiguous() + value_9 = value_states_9.contiguous() + attn_output_36 = torch._C._nn.scaled_dot_product_attention( + query_9, + key_9, + value_9, + attn_mask=attention_mask_10, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_9 = key_9 = value_9 = attention_mask_10 = None + transpose_40 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_37 = transpose_40.contiguous() + transpose_40 = None + reshape_9 = attn_output_37.reshape(1, 2, -1) + attn_output_37 = None + attn_output_38 = reshape_9.contiguous() + reshape_9 = None + attn_output_39 = torch._C._nn.linear( + attn_output_38, + l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_38 = l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_9 = torch.nn.functional.dropout(attn_output_39, 0.0, False, False) + attn_output_39 = None + hidden_states_46 = torch._C._nn.linear( + hidden_states_45, + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_45 = ( + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_79 = 0.5 * hidden_states_46 + pow_10 = torch.pow(hidden_states_46, 3.0) + mul_80 = 0.044715 * pow_10 + pow_10 = None + add_56 = hidden_states_46 + mul_80 + hidden_states_46 = mul_80 = None + mul_81 = 0.7978845608028654 * add_56 + add_56 = None + tanh_9 = torch.tanh(mul_81) + mul_81 = None + add_57 = 1.0 + tanh_9 + tanh_9 = None + hidden_states_47 = mul_79 * add_57 + mul_79 = add_57 = None + hidden_states_48 = torch._C._nn.linear( + hidden_states_47, + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_47 = ( + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_9 = torch.nn.functional.dropout( + hidden_states_48, 0.0, False, False + ) + hidden_states_48 = None + add_58 = attn_outputs_9 + feed_forward_hidden_states_9 + attn_outputs_9 = feed_forward_hidden_states_9 = None + hidden_states_49 = add_58 + hidden_states_44 + add_58 = hidden_states_44 = None + hidden_states_50 = torch.nn.functional.layer_norm( + hidden_states_49, + (2048,), + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + 
l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ + ) = None + linear_60 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_31 = linear_60.view((1, 2, -1, 64)) + linear_60 = None + query_states_20 = view_31.transpose(1, 2) + view_31 = None + linear_61 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_32 = linear_61.view((1, 2, -1, 64)) + linear_61 = None + key_states_20 = view_32.transpose(1, 2) + view_32 = None + linear_62 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_33 = linear_62.view((1, 2, -1, 64)) + linear_62 = None + value_states_10 = view_33.transpose(1, 2) + view_33 = None + query_rot_10 = query_states_20[(Ellipsis, slice(None, 32, None))] + query_pass_10 = query_states_20[(Ellipsis, slice(32, None, None))] + query_states_20 = None + key_rot_10 = key_states_20[(Ellipsis, slice(None, 32, None))] + key_pass_10 = key_states_20[(Ellipsis, slice(32, None, None))] + key_states_20 = None + cos_13 = cos_2.unsqueeze(1) + sin_13 = sin_2.unsqueeze(1) + mul_83 = query_rot_10 * cos_13 + x1_20 = query_rot_10[(Ellipsis, slice(None, 16, None))] + x2_20 = query_rot_10[(Ellipsis, slice(16, None, None))] + query_rot_10 = None + neg_20 = -x2_20 + x2_20 = None + cat_41 = torch.cat((neg_20, x1_20), dim=-1) + neg_20 = x1_20 = None + mul_84 = cat_41 * sin_13 + cat_41 = None + q_embed_10 = mul_83 + mul_84 + mul_83 = mul_84 = None + mul_85 = key_rot_10 * cos_13 + cos_13 = None + x1_21 = key_rot_10[(Ellipsis, slice(None, 16, None))] + x2_21 = key_rot_10[(Ellipsis, slice(16, None, None))] + key_rot_10 = None + neg_21 = -x2_21 + x2_21 = None + cat_42 = torch.cat((neg_21, x1_21), dim=-1) + neg_21 = x1_21 = None + mul_86 = cat_42 * sin_13 + cat_42 = sin_13 = None + k_embed_10 = mul_85 + mul_86 + mul_85 = mul_86 = None + query_states_21 = torch.cat((q_embed_10, query_pass_10), dim=-1) + q_embed_10 = query_pass_10 = None + key_states_21 = torch.cat((k_embed_10, key_pass_10), dim=-1) + k_embed_10 = key_pass_10 = None + attention_mask_11 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_10 = query_states_21.contiguous() + query_states_21 = None + key_10 = key_states_21.contiguous() + value_10 = value_states_10.contiguous() + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + query_10, + key_10, + value_10, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_10 = key_10 = value_10 = 
attention_mask_11 = None + transpose_44 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_44.contiguous() + transpose_44 = None + reshape_10 = attn_output_41.reshape(1, 2, -1) + attn_output_41 = None + attn_output_42 = reshape_10.contiguous() + reshape_10 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_42 = l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_10 = torch.nn.functional.dropout(attn_output_43, 0.0, False, False) + attn_output_43 = None + hidden_states_51 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_50 = ( + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_87 = 0.5 * hidden_states_51 + pow_11 = torch.pow(hidden_states_51, 3.0) + mul_88 = 0.044715 * pow_11 + pow_11 = None + add_62 = hidden_states_51 + mul_88 + hidden_states_51 = mul_88 = None + mul_89 = 0.7978845608028654 * add_62 + add_62 = None + tanh_10 = torch.tanh(mul_89) + mul_89 = None + add_63 = 1.0 + tanh_10 + tanh_10 = None + hidden_states_52 = mul_87 * add_63 + mul_87 = add_63 = None + hidden_states_53 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_52 = ( + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_10 = torch.nn.functional.dropout( + hidden_states_53, 0.0, False, False + ) + hidden_states_53 = None + add_64 = attn_outputs_10 + feed_forward_hidden_states_10 + attn_outputs_10 = feed_forward_hidden_states_10 = None + hidden_states_54 = add_64 + hidden_states_49 + add_64 = hidden_states_49 = None + hidden_states_55 = torch.nn.functional.layer_norm( + hidden_states_54, + (2048,), + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ + ) = None + linear_66 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_34 = linear_66.view((1, 2, -1, 64)) + linear_66 = None + query_states_22 = view_34.transpose(1, 2) + view_34 = None + linear_67 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_, + 
l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_35 = linear_67.view((1, 2, -1, 64)) + linear_67 = None + key_states_22 = view_35.transpose(1, 2) + view_35 = None + linear_68 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_36 = linear_68.view((1, 2, -1, 64)) + linear_68 = None + value_states_11 = view_36.transpose(1, 2) + view_36 = None + query_rot_11 = query_states_22[(Ellipsis, slice(None, 32, None))] + query_pass_11 = query_states_22[(Ellipsis, slice(32, None, None))] + query_states_22 = None + key_rot_11 = key_states_22[(Ellipsis, slice(None, 32, None))] + key_pass_11 = key_states_22[(Ellipsis, slice(32, None, None))] + key_states_22 = None + cos_14 = cos_2.unsqueeze(1) + sin_14 = sin_2.unsqueeze(1) + mul_91 = query_rot_11 * cos_14 + x1_22 = query_rot_11[(Ellipsis, slice(None, 16, None))] + x2_22 = query_rot_11[(Ellipsis, slice(16, None, None))] + query_rot_11 = None + neg_22 = -x2_22 + x2_22 = None + cat_45 = torch.cat((neg_22, x1_22), dim=-1) + neg_22 = x1_22 = None + mul_92 = cat_45 * sin_14 + cat_45 = None + q_embed_11 = mul_91 + mul_92 + mul_91 = mul_92 = None + mul_93 = key_rot_11 * cos_14 + cos_14 = None + x1_23 = key_rot_11[(Ellipsis, slice(None, 16, None))] + x2_23 = key_rot_11[(Ellipsis, slice(16, None, None))] + key_rot_11 = None + neg_23 = -x2_23 + x2_23 = None + cat_46 = torch.cat((neg_23, x1_23), dim=-1) + neg_23 = x1_23 = None + mul_94 = cat_46 * sin_14 + cat_46 = sin_14 = None + k_embed_11 = mul_93 + mul_94 + mul_93 = mul_94 = None + query_states_23 = torch.cat((q_embed_11, query_pass_11), dim=-1) + q_embed_11 = query_pass_11 = None + key_states_23 = torch.cat((k_embed_11, key_pass_11), dim=-1) + k_embed_11 = key_pass_11 = None + attention_mask_12 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_11 = query_states_23.contiguous() + query_states_23 = None + key_11 = key_states_23.contiguous() + value_11 = value_states_11.contiguous() + attn_output_44 = torch._C._nn.scaled_dot_product_attention( + query_11, + key_11, + value_11, + attn_mask=attention_mask_12, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_11 = key_11 = value_11 = attention_mask_12 = None + transpose_48 = attn_output_44.transpose(1, 2) + attn_output_44 = None + attn_output_45 = transpose_48.contiguous() + transpose_48 = None + reshape_11 = attn_output_45.reshape(1, 2, -1) + attn_output_45 = None + attn_output_46 = reshape_11.contiguous() + reshape_11 = None + attn_output_47 = torch._C._nn.linear( + attn_output_46, + l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_46 = l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_11 = 
+        query_rot_11 = query_states_22[(Ellipsis, slice(None, 32, None))]
+        query_pass_11 = query_states_22[(Ellipsis, slice(32, None, None))]
+        query_states_22 = None
+        key_rot_11 = key_states_22[(Ellipsis, slice(None, 32, None))]
+        key_pass_11 = key_states_22[(Ellipsis, slice(32, None, None))]
+        key_states_22 = None
+        cos_14 = cos_2.unsqueeze(1)
+        sin_14 = sin_2.unsqueeze(1)
+        mul_91 = query_rot_11 * cos_14
+        x1_22 = query_rot_11[(Ellipsis, slice(None, 16, None))]
+        x2_22 = query_rot_11[(Ellipsis, slice(16, None, None))]
+        query_rot_11 = None
+        neg_22 = -x2_22
+        x2_22 = None
+        cat_45 = torch.cat((neg_22, x1_22), dim=-1)
+        neg_22 = x1_22 = None
+        mul_92 = cat_45 * sin_14
+        cat_45 = None
+        q_embed_11 = mul_91 + mul_92
+        mul_91 = mul_92 = None
+        mul_93 = key_rot_11 * cos_14
+        cos_14 = None
+        x1_23 = key_rot_11[(Ellipsis, slice(None, 16, None))]
+        x2_23 = key_rot_11[(Ellipsis, slice(16, None, None))]
+        key_rot_11 = None
+        neg_23 = -x2_23
+        x2_23 = None
+        cat_46 = torch.cat((neg_23, x1_23), dim=-1)
+        neg_23 = x1_23 = None
+        mul_94 = cat_46 * sin_14
+        cat_46 = sin_14 = None
+        k_embed_11 = mul_93 + mul_94
+        mul_93 = mul_94 = None
+        query_states_23 = torch.cat((q_embed_11, query_pass_11), dim=-1)
+        q_embed_11 = query_pass_11 = None
+        key_states_23 = torch.cat((k_embed_11, key_pass_11), dim=-1)
+        k_embed_11 = key_pass_11 = None
+        attention_mask_12 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_11 = query_states_23.contiguous()
+        query_states_23 = None
+        key_11 = key_states_23.contiguous()
+        value_11 = value_states_11.contiguous()
+        attn_output_44 = torch._C._nn.scaled_dot_product_attention(
+            query_11,
+            key_11,
+            value_11,
+            attn_mask=attention_mask_12,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_11 = key_11 = value_11 = attention_mask_12 = None
+        transpose_48 = attn_output_44.transpose(1, 2)
+        attn_output_44 = None
+        attn_output_45 = transpose_48.contiguous()
+        transpose_48 = None
+        reshape_11 = attn_output_45.reshape(1, 2, -1)
+        attn_output_45 = None
+        attn_output_46 = reshape_11.contiguous()
+        reshape_11 = None
+        attn_output_47 = torch._C._nn.linear(
+            attn_output_46,
+            l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_46 = l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_11 = torch.nn.functional.dropout(attn_output_47, 0.0, False, False)
+        attn_output_47 = None
+        hidden_states_56 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_55 = (
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_95 = 0.5 * hidden_states_56
+        pow_12 = torch.pow(hidden_states_56, 3.0)
+        mul_96 = 0.044715 * pow_12
+        pow_12 = None
+        add_68 = hidden_states_56 + mul_96
+        hidden_states_56 = mul_96 = None
+        mul_97 = 0.7978845608028654 * add_68
+        add_68 = None
+        tanh_11 = torch.tanh(mul_97)
+        mul_97 = None
+        add_69 = 1.0 + tanh_11
+        tanh_11 = None
+        hidden_states_57 = mul_95 * add_69
+        mul_95 = add_69 = None
+        hidden_states_58 = torch._C._nn.linear(
+            hidden_states_57,
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_57 = (
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_11 = torch.nn.functional.dropout(
+            hidden_states_58, 0.0, False, False
+        )
+        hidden_states_58 = None
+        add_70 = attn_outputs_11 + feed_forward_hidden_states_11
+        attn_outputs_11 = feed_forward_hidden_states_11 = None
+        hidden_states_59 = add_70 + hidden_states_54
+        add_70 = hidden_states_54 = None
+        hidden_states_60 = torch.nn.functional.layer_norm(
+            hidden_states_59,
+            (2048,),
+            l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_72 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_37 = linear_72.view((1, 2, -1, 64))
+        linear_72 = None
+        query_states_24 = view_37.transpose(1, 2)
+        view_37 = None
+        linear_73 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_38 = linear_73.view((1, 2, -1, 64))
+        linear_73 = None
+        key_states_24 = view_38.transpose(1, 2)
+        view_38 = None
+        linear_74 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_39 = linear_74.view((1, 2, -1, 64))
+        linear_74 = None
+        value_states_12 = view_39.transpose(1, 2)
+        view_39 = None
+        query_rot_12 = query_states_24[(Ellipsis, slice(None, 32, None))]
+        query_pass_12 = query_states_24[(Ellipsis, slice(32, None, None))]
+        query_states_24 = None
+        key_rot_12 = key_states_24[(Ellipsis, slice(None, 32, None))]
+        key_pass_12 = key_states_24[(Ellipsis, slice(32, None, None))]
+        key_states_24 = None
+        cos_15 = cos_2.unsqueeze(1)
+        sin_15 = sin_2.unsqueeze(1)
+        mul_99 = query_rot_12 * cos_15
+        x1_24 = query_rot_12[(Ellipsis, slice(None, 16, None))]
+        x2_24 = query_rot_12[(Ellipsis, slice(16, None, None))]
+        query_rot_12 = None
+        neg_24 = -x2_24
+        x2_24 = None
+        cat_49 = torch.cat((neg_24, x1_24), dim=-1)
+        neg_24 = x1_24 = None
+        mul_100 = cat_49 * sin_15
+        cat_49 = None
+        q_embed_12 = mul_99 + mul_100
+        mul_99 = mul_100 = None
+        mul_101 = key_rot_12 * cos_15
+        cos_15 = None
+        x1_25 = key_rot_12[(Ellipsis, slice(None, 16, None))]
+        x2_25 = key_rot_12[(Ellipsis, slice(16, None, None))]
+        key_rot_12 = None
+        neg_25 = -x2_25
+        x2_25 = None
+        cat_50 = torch.cat((neg_25, x1_25), dim=-1)
+        neg_25 = x1_25 = None
+        mul_102 = cat_50 * sin_15
+        cat_50 = sin_15 = None
+        k_embed_12 = mul_101 + mul_102
+        mul_101 = mul_102 = None
+        query_states_25 = torch.cat((q_embed_12, query_pass_12), dim=-1)
+        q_embed_12 = query_pass_12 = None
+        key_states_25 = torch.cat((k_embed_12, key_pass_12), dim=-1)
+        k_embed_12 = key_pass_12 = None
+        attention_mask_13 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_12 = query_states_25.contiguous()
+        query_states_25 = None
+        key_12 = key_states_25.contiguous()
+        value_12 = value_states_12.contiguous()
+        attn_output_48 = torch._C._nn.scaled_dot_product_attention(
+            query_12,
+            key_12,
+            value_12,
+            attn_mask=attention_mask_13,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_12 = key_12 = value_12 = attention_mask_13 = None
+        transpose_52 = attn_output_48.transpose(1, 2)
+        attn_output_48 = None
+        attn_output_49 = transpose_52.contiguous()
+        transpose_52 = None
+        reshape_12 = attn_output_49.reshape(1, 2, -1)
+        attn_output_49 = None
+        attn_output_50 = reshape_12.contiguous()
+        reshape_12 = None
+        attn_output_51 = torch._C._nn.linear(
+            attn_output_50,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_50 = l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_12 = torch.nn.functional.dropout(attn_output_51, 0.0, False, False)
+        attn_output_51 = None
+        hidden_states_61 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_60 = (
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
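+        # The scalar chain below is the tanh approximation of GELU, unrolled by
+        # the tracer: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))),
+        # with 0.7978845608028654 = sqrt(2/pi). Every layer's MLP repeats it.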
+        mul_103 = 0.5 * hidden_states_61
+        pow_13 = torch.pow(hidden_states_61, 3.0)
+        mul_104 = 0.044715 * pow_13
+        pow_13 = None
+        add_74 = hidden_states_61 + mul_104
+        hidden_states_61 = mul_104 = None
+        mul_105 = 0.7978845608028654 * add_74
+        add_74 = None
+        tanh_12 = torch.tanh(mul_105)
+        mul_105 = None
+        add_75 = 1.0 + tanh_12
+        tanh_12 = None
+        hidden_states_62 = mul_103 * add_75
+        mul_103 = add_75 = None
+        hidden_states_63 = torch._C._nn.linear(
+            hidden_states_62,
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_62 = (
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_12 = torch.nn.functional.dropout(
+            hidden_states_63, 0.0, False, False
+        )
+        hidden_states_63 = None
+        add_76 = attn_outputs_12 + feed_forward_hidden_states_12
+        attn_outputs_12 = feed_forward_hidden_states_12 = None
+        hidden_states_64 = add_76 + hidden_states_59
+        add_76 = hidden_states_59 = None
+        hidden_states_65 = torch.nn.functional.layer_norm(
+            hidden_states_64,
+            (2048,),
+            l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_78 = torch._C._nn.linear(
+            hidden_states_65,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_40 = linear_78.view((1, 2, -1, 64))
+        linear_78 = None
+        query_states_26 = view_40.transpose(1, 2)
+        view_40 = None
+        linear_79 = torch._C._nn.linear(
+            hidden_states_65,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_41 = linear_79.view((1, 2, -1, 64))
+        linear_79 = None
+        key_states_26 = view_41.transpose(1, 2)
+        view_41 = None
+        linear_80 = torch._C._nn.linear(
+            hidden_states_65,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_42 = linear_80.view((1, 2, -1, 64))
+        linear_80 = None
+        value_states_13 = view_42.transpose(1, 2)
+        view_42 = None
+        query_rot_13 = query_states_26[(Ellipsis, slice(None, 32, None))]
+        query_pass_13 = query_states_26[(Ellipsis, slice(32, None, None))]
+        query_states_26 = None
+        key_rot_13 = key_states_26[(Ellipsis, slice(None, 32, None))]
+        key_pass_13 = key_states_26[(Ellipsis, slice(32, None, None))]
+        key_states_26 = None
+        cos_16 = cos_2.unsqueeze(1)
+        sin_16 = sin_2.unsqueeze(1)
+        mul_107 = query_rot_13 * cos_16
+        x1_26 = query_rot_13[(Ellipsis, slice(None, 16, None))]
+        x2_26 = query_rot_13[(Ellipsis, slice(16, None, None))]
+        query_rot_13 = None
+        neg_26 = -x2_26
+        x2_26 = None
+        cat_53 = torch.cat((neg_26, x1_26), dim=-1)
+        neg_26 = x1_26 = None
+        mul_108 = cat_53 * sin_16
+        cat_53 = None
+        q_embed_13 = mul_107 + mul_108
+        mul_107 = mul_108 = None
+        mul_109 = key_rot_13 * cos_16
+        cos_16 = None
+        x1_27 = key_rot_13[(Ellipsis, slice(None, 16, None))]
+        x2_27 = key_rot_13[(Ellipsis, slice(16, None, None))]
+        key_rot_13 = None
+        neg_27 = -x2_27
+        x2_27 = None
+        cat_54 = torch.cat((neg_27, x1_27), dim=-1)
+        neg_27 = x1_27 = None
+        mul_110 = cat_54 * sin_16
+        cat_54 = sin_16 = None
+        k_embed_13 = mul_109 + mul_110
+        mul_109 = mul_110 = None
+        query_states_27 = torch.cat((q_embed_13, query_pass_13), dim=-1)
+        q_embed_13 = query_pass_13 = None
+        key_states_27 = torch.cat((k_embed_13, key_pass_13), dim=-1)
+        k_embed_13 = key_pass_13 = None
+        attention_mask_14 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_13 = query_states_27.contiguous()
+        query_states_27 = None
+        key_13 = key_states_27.contiguous()
+        value_13 = value_states_13.contiguous()
+        attn_output_52 = torch._C._nn.scaled_dot_product_attention(
+            query_13,
+            key_13,
+            value_13,
+            attn_mask=attention_mask_14,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_13 = key_13 = value_13 = attention_mask_14 = None
+        transpose_56 = attn_output_52.transpose(1, 2)
+        attn_output_52 = None
+        attn_output_53 = transpose_56.contiguous()
+        transpose_56 = None
+        reshape_13 = attn_output_53.reshape(1, 2, -1)
+        attn_output_53 = None
+        attn_output_54 = reshape_13.contiguous()
+        reshape_13 = None
+        attn_output_55 = torch._C._nn.linear(
+            attn_output_54,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_54 = l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_13 = torch.nn.functional.dropout(attn_output_55, 0.0, False, False)
+        attn_output_55 = None
+        hidden_states_66 = torch._C._nn.linear(
+            hidden_states_65,
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_65 = (
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_111 = 0.5 * hidden_states_66
+        pow_14 = torch.pow(hidden_states_66, 3.0)
+        mul_112 = 0.044715 * pow_14
+        pow_14 = None
+        add_80 = hidden_states_66 + mul_112
+        hidden_states_66 = mul_112 = None
+        mul_113 = 0.7978845608028654 * add_80
+        add_80 = None
+        tanh_13 = torch.tanh(mul_113)
+        mul_113 = None
+        add_81 = 1.0 + tanh_13
+        tanh_13 = None
+        hidden_states_67 = mul_111 * add_81
+        mul_111 = add_81 = None
+        hidden_states_68 = torch._C._nn.linear(
+            hidden_states_67,
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_67 = (
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_13 = torch.nn.functional.dropout(
+            hidden_states_68, 0.0, False, False
+        )
+        hidden_states_68 = None
+        add_82 = attn_outputs_13 + feed_forward_hidden_states_13
+        attn_outputs_13 = feed_forward_hidden_states_13 = None
+        hidden_states_69 = add_82 + hidden_states_64
+        add_82 = hidden_states_64 = None
+        hidden_states_70 = torch.nn.functional.layer_norm(
+            hidden_states_69,
+            (2048,),
+            l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_84 = torch._C._nn.linear(
+            hidden_states_70,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_43 = linear_84.view((1, 2, -1, 64))
+        linear_84 = None
+        query_states_28 = view_43.transpose(1, 2)
+        view_43 = None
+        linear_85 = torch._C._nn.linear(
+            hidden_states_70,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_44 = linear_85.view((1, 2, -1, 64))
+        linear_85 = None
+        key_states_28 = view_44.transpose(1, 2)
+        view_44 = None
+        linear_86 = torch._C._nn.linear(
+            hidden_states_70,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_45 = linear_86.view((1, 2, -1, 64))
+        linear_86 = None
+        value_states_14 = view_45.transpose(1, 2)
+        view_45 = None
+        query_rot_14 = query_states_28[(Ellipsis, slice(None, 32, None))]
+        query_pass_14 = query_states_28[(Ellipsis, slice(32, None, None))]
+        query_states_28 = None
+        key_rot_14 = key_states_28[(Ellipsis, slice(None, 32, None))]
+        key_pass_14 = key_states_28[(Ellipsis, slice(32, None, None))]
+        key_states_28 = None
+        cos_17 = cos_2.unsqueeze(1)
+        sin_17 = sin_2.unsqueeze(1)
+        mul_115 = query_rot_14 * cos_17
+        x1_28 = query_rot_14[(Ellipsis, slice(None, 16, None))]
+        x2_28 = query_rot_14[(Ellipsis, slice(16, None, None))]
+        query_rot_14 = None
+        neg_28 = -x2_28
+        x2_28 = None
+        cat_57 = torch.cat((neg_28, x1_28), dim=-1)
+        neg_28 = x1_28 = None
+        mul_116 = cat_57 * sin_17
+        cat_57 = None
+        q_embed_14 = mul_115 + mul_116
+        mul_115 = mul_116 = None
+        mul_117 = key_rot_14 * cos_17
+        cos_17 = None
+        x1_29 = key_rot_14[(Ellipsis, slice(None, 16, None))]
+        x2_29 = key_rot_14[(Ellipsis, slice(16, None, None))]
+        key_rot_14 = None
+        neg_29 = -x2_29
+        x2_29 = None
+        cat_58 = torch.cat((neg_29, x1_29), dim=-1)
+        neg_29 = x1_29 = None
+        mul_118 = cat_58 * sin_17
+        cat_58 = sin_17 = None
+        k_embed_14 = mul_117 + mul_118
+        mul_117 = mul_118 = None
+        query_states_29 = torch.cat((q_embed_14, query_pass_14), dim=-1)
+        q_embed_14 = query_pass_14 = None
+        key_states_29 = torch.cat((k_embed_14, key_pass_14), dim=-1)
+        k_embed_14 = key_pass_14 = None
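+        # causal_mask_2 is built once for the whole forward; each layer re-slices
+        # it to the current key length (2 here, the traced sequence length) and
+        # passes it to scaled_dot_product_attention with an explicit
+        # scale=0.125 = 1/sqrt(head_dim=64), which is why is_causal stays False.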
+        attention_mask_15 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_14 = query_states_29.contiguous()
+        query_states_29 = None
+        key_14 = key_states_29.contiguous()
+        value_14 = value_states_14.contiguous()
+        attn_output_56 = torch._C._nn.scaled_dot_product_attention(
+            query_14,
+            key_14,
+            value_14,
+            attn_mask=attention_mask_15,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_14 = key_14 = value_14 = attention_mask_15 = None
+        transpose_60 = attn_output_56.transpose(1, 2)
+        attn_output_56 = None
+        attn_output_57 = transpose_60.contiguous()
+        transpose_60 = None
+        reshape_14 = attn_output_57.reshape(1, 2, -1)
+        attn_output_57 = None
+        attn_output_58 = reshape_14.contiguous()
+        reshape_14 = None
+        attn_output_59 = torch._C._nn.linear(
+            attn_output_58,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_58 = l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_14 = torch.nn.functional.dropout(attn_output_59, 0.0, False, False)
+        attn_output_59 = None
+        hidden_states_71 = torch._C._nn.linear(
+            hidden_states_70,
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_70 = (
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_119 = 0.5 * hidden_states_71
+        pow_15 = torch.pow(hidden_states_71, 3.0)
+        mul_120 = 0.044715 * pow_15
+        pow_15 = None
+        add_86 = hidden_states_71 + mul_120
+        hidden_states_71 = mul_120 = None
+        mul_121 = 0.7978845608028654 * add_86
+        add_86 = None
+        tanh_14 = torch.tanh(mul_121)
+        mul_121 = None
+        add_87 = 1.0 + tanh_14
+        tanh_14 = None
+        hidden_states_72 = mul_119 * add_87
+        mul_119 = add_87 = None
+        hidden_states_73 = torch._C._nn.linear(
+            hidden_states_72,
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_72 = (
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_14 = torch.nn.functional.dropout(
+            hidden_states_73, 0.0, False, False
+        )
+        hidden_states_73 = None
+        add_88 = attn_outputs_14 + feed_forward_hidden_states_14
+        attn_outputs_14 = feed_forward_hidden_states_14 = None
+        hidden_states_74 = add_88 + hidden_states_69
+        add_88 = hidden_states_69 = None
+        hidden_states_75 = torch.nn.functional.layer_norm(
+            hidden_states_74,
+            (2048,),
+            l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_90 = torch._C._nn.linear(
+            hidden_states_75,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_46 = linear_90.view((1, 2, -1, 64))
+        linear_90 = None
+        query_states_30 = view_46.transpose(1, 2)
+        view_46 = None
+        linear_91 = torch._C._nn.linear(
+            hidden_states_75,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_47 = linear_91.view((1, 2, -1, 64))
+        linear_91 = None
+        key_states_30 = view_47.transpose(1, 2)
+        view_47 = None
+        linear_92 = torch._C._nn.linear(
+            hidden_states_75,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_48 = linear_92.view((1, 2, -1, 64))
+        linear_92 = None
+        value_states_15 = view_48.transpose(1, 2)
+        view_48 = None
+        query_rot_15 = query_states_30[(Ellipsis, slice(None, 32, None))]
+        query_pass_15 = query_states_30[(Ellipsis, slice(32, None, None))]
+        query_states_30 = None
+        key_rot_15 = key_states_30[(Ellipsis, slice(None, 32, None))]
+        key_pass_15 = key_states_30[(Ellipsis, slice(32, None, None))]
+        key_states_30 = None
+        cos_18 = cos_2.unsqueeze(1)
+        sin_18 = sin_2.unsqueeze(1)
+        mul_123 = query_rot_15 * cos_18
+        x1_30 = query_rot_15[(Ellipsis, slice(None, 16, None))]
+        x2_30 = query_rot_15[(Ellipsis, slice(16, None, None))]
+        query_rot_15 = None
+        neg_30 = -x2_30
+        x2_30 = None
+        cat_61 = torch.cat((neg_30, x1_30), dim=-1)
+        neg_30 = x1_30 = None
+        mul_124 = cat_61 * sin_18
+        cat_61 = None
+        q_embed_15 = mul_123 + mul_124
+        mul_123 = mul_124 = None
+        mul_125 = key_rot_15 * cos_18
+        cos_18 = None
+        x1_31 = key_rot_15[(Ellipsis, slice(None, 16, None))]
+        x2_31 = key_rot_15[(Ellipsis, slice(16, None, None))]
+        key_rot_15 = None
+        neg_31 = -x2_31
+        x2_31 = None
+        cat_62 = torch.cat((neg_31, x1_31), dim=-1)
+        neg_31 = x1_31 = None
+        mul_126 = cat_62 * sin_18
+        cat_62 = sin_18 = None
+        k_embed_15 = mul_125 + mul_126
+        mul_125 = mul_126 = None
+        query_states_31 = torch.cat((q_embed_15, query_pass_15), dim=-1)
+        q_embed_15 = query_pass_15 = None
+        key_states_31 = torch.cat((k_embed_15, key_pass_15), dim=-1)
+        k_embed_15 = key_pass_15 = None
+        attention_mask_16 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_15 = query_states_31.contiguous()
+        query_states_31 = None
+        key_15 = key_states_31.contiguous()
+        value_15 = value_states_15.contiguous()
+        attn_output_60 = torch._C._nn.scaled_dot_product_attention(
+            query_15,
+            key_15,
+            value_15,
+            attn_mask=attention_mask_16,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_15 = key_15 = value_15 = attention_mask_16 = None
+        transpose_64 = attn_output_60.transpose(1, 2)
+        attn_output_60 = None
+        attn_output_61 = transpose_64.contiguous()
+        transpose_64 = None
+        reshape_15 = attn_output_61.reshape(1, 2, -1)
+        attn_output_61 = None
+        attn_output_62 = reshape_15.contiguous()
+        reshape_15 = None
+        attn_output_63 = torch._C._nn.linear(
+            attn_output_62,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_62 = l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_15 = torch.nn.functional.dropout(attn_output_63, 0.0, False, False)
+        attn_output_63 = None
+        hidden_states_76 = torch._C._nn.linear(
+            hidden_states_75,
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_75 = (
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_127 = 0.5 * hidden_states_76
+        pow_16 = torch.pow(hidden_states_76, 3.0)
+        mul_128 = 0.044715 * pow_16
+        pow_16 = None
+        add_92 = hidden_states_76 + mul_128
+        hidden_states_76 = mul_128 = None
+        mul_129 = 0.7978845608028654 * add_92
+        add_92 = None
+        tanh_15 = torch.tanh(mul_129)
+        mul_129 = None
+        add_93 = 1.0 + tanh_15
+        tanh_15 = None
+        hidden_states_77 = mul_127 * add_93
+        mul_127 = add_93 = None
+        hidden_states_78 = torch._C._nn.linear(
+            hidden_states_77,
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_77 = (
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_15 = torch.nn.functional.dropout(
+            hidden_states_78, 0.0, False, False
+        )
+        hidden_states_78 = None
+        add_94 = attn_outputs_15 + feed_forward_hidden_states_15
+        attn_outputs_15 = feed_forward_hidden_states_15 = None
+        hidden_states_79 = add_94 + hidden_states_74
+        add_94 = hidden_states_74 = None
+        hidden_states_80 = torch.nn.functional.layer_norm(
+            hidden_states_79,
+            (2048,),
+            l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_96 = torch._C._nn.linear(
+            hidden_states_80,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_49 = linear_96.view((1, 2, -1, 64))
+        linear_96 = None
+        query_states_32 = view_49.transpose(1, 2)
+        view_49 = None
+        linear_97 = torch._C._nn.linear(
+            hidden_states_80,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_50 = linear_97.view((1, 2, -1, 64))
+        linear_97 = None
+        key_states_32 = view_50.transpose(1, 2)
+        view_50 = None
+        linear_98 = torch._C._nn.linear(
+            hidden_states_80,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_51 = linear_98.view((1, 2, -1, 64))
+        linear_98 = None
+        value_states_16 = view_51.transpose(1, 2)
+        view_51 = None
+        query_rot_16 = query_states_32[(Ellipsis, slice(None, 32, None))]
+        query_pass_16 = query_states_32[(Ellipsis, slice(32, None, None))]
+        query_states_32 = None
+        key_rot_16 = key_states_32[(Ellipsis, slice(None, 32, None))]
+        key_pass_16 = key_states_32[(Ellipsis, slice(32, None, None))]
+        key_states_32 = None
+        cos_19 = cos_2.unsqueeze(1)
+        sin_19 = sin_2.unsqueeze(1)
+        mul_131 = query_rot_16 * cos_19
+        x1_32 = query_rot_16[(Ellipsis, slice(None, 16, None))]
+        x2_32 = query_rot_16[(Ellipsis, slice(16, None, None))]
+        query_rot_16 = None
+        neg_32 = -x2_32
+        x2_32 = None
+        cat_65 = torch.cat((neg_32, x1_32), dim=-1)
+        neg_32 = x1_32 = None
+        mul_132 = cat_65 * sin_19
+        cat_65 = None
+        q_embed_16 = mul_131 + mul_132
+        mul_131 = mul_132 = None
+        mul_133 = key_rot_16 * cos_19
+        cos_19 = None
+        x1_33 = key_rot_16[(Ellipsis, slice(None, 16, None))]
+        x2_33 = key_rot_16[(Ellipsis, slice(16, None, None))]
+        key_rot_16 = None
+        neg_33 = -x2_33
+        x2_33 = None
+        cat_66 = torch.cat((neg_33, x1_33), dim=-1)
+        neg_33 = x1_33 = None
+        mul_134 = cat_66 * sin_19
+        cat_66 = sin_19 = None
+        k_embed_16 = mul_133 + mul_134
+        mul_133 = mul_134 = None
+        query_states_33 = torch.cat((q_embed_16, query_pass_16), dim=-1)
+        q_embed_16 = query_pass_16 = None
+        key_states_33 = torch.cat((k_embed_16, key_pass_16), dim=-1)
+        k_embed_16 = key_pass_16 = None
+        attention_mask_17 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_16 = query_states_33.contiguous()
+        query_states_33 = None
+        key_16 = key_states_33.contiguous()
+        value_16 = value_states_16.contiguous()
+        attn_output_64 = torch._C._nn.scaled_dot_product_attention(
+            query_16,
+            key_16,
+            value_16,
+            attn_mask=attention_mask_17,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_16 = key_16 = value_16 = attention_mask_17 = None
+        transpose_68 = attn_output_64.transpose(1, 2)
+        attn_output_64 = None
+        attn_output_65 = transpose_68.contiguous()
+        transpose_68 = None
+        reshape_16 = attn_output_65.reshape(1, 2, -1)
+        attn_output_65 = None
+        attn_output_66 = reshape_16.contiguous()
+        reshape_16 = None
+        attn_output_67 = torch._C._nn.linear(
+            attn_output_66,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_66 = l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_16 = torch.nn.functional.dropout(attn_output_67, 0.0, False, False)
+        attn_output_67 = None
+        hidden_states_81 = torch._C._nn.linear(
+            hidden_states_80,
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_80 = (
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_135 = 0.5 * hidden_states_81
+        pow_17 = torch.pow(hidden_states_81, 3.0)
+        mul_136 = 0.044715 * pow_17
+        pow_17 = None
+        add_98 = hidden_states_81 + mul_136
+        hidden_states_81 = mul_136 = None
+        mul_137 = 0.7978845608028654 * add_98
+        add_98 = None
+        tanh_16 = torch.tanh(mul_137)
+        mul_137 = None
+        add_99 = 1.0 + tanh_16
+        tanh_16 = None
+        hidden_states_82 = mul_135 * add_99
+        mul_135 = add_99 = None
+        hidden_states_83 = torch._C._nn.linear(
+            hidden_states_82,
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_82 = (
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_16 = torch.nn.functional.dropout(
+            hidden_states_83, 0.0, False, False
+        )
+        hidden_states_83 = None
+        add_100 = attn_outputs_16 + feed_forward_hidden_states_16
+        attn_outputs_16 = feed_forward_hidden_states_16 = None
+        hidden_states_84 = add_100 + hidden_states_79
+        add_100 = hidden_states_79 = None
+        hidden_states_85 = torch.nn.functional.layer_norm(
+            hidden_states_84,
+            (2048,),
+            l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_102 = torch._C._nn.linear(
+            hidden_states_85,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_52 = linear_102.view((1, 2, -1, 64))
+        linear_102 = None
+        query_states_34 = view_52.transpose(1, 2)
+        view_52 = None
+        linear_103 = torch._C._nn.linear(
+            hidden_states_85,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_53 = linear_103.view((1, 2, -1, 64))
+        linear_103 = None
+        key_states_34 = view_53.transpose(1, 2)
+        view_53 = None
+        linear_104 = torch._C._nn.linear(
+            hidden_states_85,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_54 = linear_104.view((1, 2, -1, 64))
+        linear_104 = None
+        value_states_17 = view_54.transpose(1, 2)
+        view_54 = None
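+        # cos_2 and sin_2 are the shared rotary tables produced earlier in the
+        # graph; every layer only re-broadcasts them with unsqueeze(1) before
+        # the element-wise rotation below.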
+        query_rot_17 = query_states_34[(Ellipsis, slice(None, 32, None))]
+        query_pass_17 = query_states_34[(Ellipsis, slice(32, None, None))]
+        query_states_34 = None
+        key_rot_17 = key_states_34[(Ellipsis, slice(None, 32, None))]
+        key_pass_17 = key_states_34[(Ellipsis, slice(32, None, None))]
+        key_states_34 = None
+        cos_20 = cos_2.unsqueeze(1)
+        sin_20 = sin_2.unsqueeze(1)
+        mul_139 = query_rot_17 * cos_20
+        x1_34 = query_rot_17[(Ellipsis, slice(None, 16, None))]
+        x2_34 = query_rot_17[(Ellipsis, slice(16, None, None))]
+        query_rot_17 = None
+        neg_34 = -x2_34
+        x2_34 = None
+        cat_69 = torch.cat((neg_34, x1_34), dim=-1)
+        neg_34 = x1_34 = None
+        mul_140 = cat_69 * sin_20
+        cat_69 = None
+        q_embed_17 = mul_139 + mul_140
+        mul_139 = mul_140 = None
+        mul_141 = key_rot_17 * cos_20
+        cos_20 = None
+        x1_35 = key_rot_17[(Ellipsis, slice(None, 16, None))]
+        x2_35 = key_rot_17[(Ellipsis, slice(16, None, None))]
+        key_rot_17 = None
+        neg_35 = -x2_35
+        x2_35 = None
+        cat_70 = torch.cat((neg_35, x1_35), dim=-1)
+        neg_35 = x1_35 = None
+        mul_142 = cat_70 * sin_20
+        cat_70 = sin_20 = None
+        k_embed_17 = mul_141 + mul_142
+        mul_141 = mul_142 = None
+        query_states_35 = torch.cat((q_embed_17, query_pass_17), dim=-1)
+        q_embed_17 = query_pass_17 = None
+        key_states_35 = torch.cat((k_embed_17, key_pass_17), dim=-1)
+        k_embed_17 = key_pass_17 = None
+        attention_mask_18 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_17 = query_states_35.contiguous()
+        query_states_35 = None
+        key_17 = key_states_35.contiguous()
+        value_17 = value_states_17.contiguous()
+        attn_output_68 = torch._C._nn.scaled_dot_product_attention(
+            query_17,
+            key_17,
+            value_17,
+            attn_mask=attention_mask_18,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_17 = key_17 = value_17 = attention_mask_18 = None
+        transpose_72 = attn_output_68.transpose(1, 2)
+        attn_output_68 = None
+        attn_output_69 = transpose_72.contiguous()
+        transpose_72 = None
+        reshape_17 = attn_output_69.reshape(1, 2, -1)
+        attn_output_69 = None
+        attn_output_70 = reshape_17.contiguous()
+        reshape_17 = None
+        attn_output_71 = torch._C._nn.linear(
+            attn_output_70,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_70 = l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_17 = torch.nn.functional.dropout(attn_output_71, 0.0, False, False)
+        attn_output_71 = None
+        hidden_states_86 = torch._C._nn.linear(
+            hidden_states_85,
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_85 = (
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_143 = 0.5 * hidden_states_86
+        pow_18 = torch.pow(hidden_states_86, 3.0)
+        mul_144 = 0.044715 * pow_18
+        pow_18 = None
+        add_104 = hidden_states_86 + mul_144
+        hidden_states_86 = mul_144 = None
+        mul_145 = 0.7978845608028654 * add_104
+        add_104 = None
+        tanh_17 = torch.tanh(mul_145)
+        mul_145 = None
+        add_105 = 1.0 + tanh_17
+        tanh_17 = None
+        hidden_states_87 = mul_143 * add_105
+        mul_143 = add_105 = None
+        hidden_states_88 = torch._C._nn.linear(
+            hidden_states_87,
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_87 = (
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_17 = torch.nn.functional.dropout(
+            hidden_states_88, 0.0, False, False
+        )
+        hidden_states_88 = None
+        add_106 = attn_outputs_17 + feed_forward_hidden_states_17
+        attn_outputs_17 = feed_forward_hidden_states_17 = None
+        hidden_states_89 = add_106 + hidden_states_84
+        add_106 = hidden_states_84 = None
+        hidden_states_90 = torch.nn.functional.layer_norm(
+            hidden_states_89,
+            (2048,),
+            l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_108 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_55 = linear_108.view((1, 2, -1, 64))
+        linear_108 = None
+        query_states_36 = view_55.transpose(1, 2)
+        view_55 = None
+        linear_109 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_56 = linear_109.view((1, 2, -1, 64))
+        linear_109 = None
+        key_states_36 = view_56.transpose(1, 2)
+        view_56 = None
+        linear_110 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_57 = linear_110.view((1, 2, -1, 64))
+        linear_110 = None
+        value_states_18 = view_57.transpose(1, 2)
+        view_57 = None
+        query_rot_18 = query_states_36[(Ellipsis, slice(None, 32, None))]
+        query_pass_18 = query_states_36[(Ellipsis, slice(32, None, None))]
+        query_states_36 = None
+        key_rot_18 = key_states_36[(Ellipsis, slice(None, 32, None))]
+        key_pass_18 = key_states_36[(Ellipsis, slice(32, None, None))]
+        key_states_36 = None
+        cos_21 = cos_2.unsqueeze(1)
+        sin_21 = sin_2.unsqueeze(1)
+        mul_147 = query_rot_18 * cos_21
+        x1_36 = query_rot_18[(Ellipsis, slice(None, 16, None))]
+        x2_36 = query_rot_18[(Ellipsis, slice(16, None, None))]
+        query_rot_18 = None
+        neg_36 = -x2_36
+        x2_36 = None
+        cat_73 = torch.cat((neg_36, x1_36), dim=-1)
+        neg_36 = x1_36 = None
+        mul_148 = cat_73 * sin_21
+        cat_73 = None
+        q_embed_18 = mul_147 + mul_148
+        mul_147 = mul_148 = None
+        mul_149 = key_rot_18 * cos_21
+        cos_21 = None
+        x1_37 = key_rot_18[(Ellipsis, slice(None, 16, None))]
+        x2_37 = key_rot_18[(Ellipsis, slice(16, None, None))]
+        key_rot_18 = None
+        neg_37 = -x2_37
+        x2_37 = None
+        cat_74 = torch.cat((neg_37, x1_37), dim=-1)
+        neg_37 = x1_37 = None
+        mul_150 = cat_74 * sin_21
+        cat_74 = sin_21 = None
+        k_embed_18 = mul_149 + mul_150
+        mul_149 = mul_150 = None
+        query_states_37 = torch.cat((q_embed_18, query_pass_18), dim=-1)
+        q_embed_18 = query_pass_18 = None
+        key_states_37 = torch.cat((k_embed_18, key_pass_18), dim=-1)
+        k_embed_18 = key_pass_18 = None
+        attention_mask_19 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_18 = query_states_37.contiguous()
+        query_states_37 = None
+        key_18 = key_states_37.contiguous()
+        value_18 = value_states_18.contiguous()
+        attn_output_72 = torch._C._nn.scaled_dot_product_attention(
+            query_18,
+            key_18,
+            value_18,
+            attn_mask=attention_mask_19,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_18 = key_18 = value_18 = attention_mask_19 = None
+        transpose_76 = attn_output_72.transpose(1, 2)
+        attn_output_72 = None
+        attn_output_73 = transpose_76.contiguous()
+        transpose_76 = None
+        reshape_18 = attn_output_73.reshape(1, 2, -1)
+        attn_output_73 = None
+        attn_output_74 = reshape_18.contiguous()
+        reshape_18 = None
+        attn_output_75 = torch._C._nn.linear(
+            attn_output_74,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_74 = l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_18 = torch.nn.functional.dropout(attn_output_75, 0.0, False, False)
+        attn_output_75 = None
+        hidden_states_91 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_90 = (
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_151 = 0.5 * hidden_states_91
+        pow_19 = torch.pow(hidden_states_91, 3.0)
+        mul_152 = 0.044715 * pow_19
+        pow_19 = None
+        add_110 = hidden_states_91 + mul_152
+        hidden_states_91 = mul_152 = None
+        mul_153 = 0.7978845608028654 * add_110
+        add_110 = None
+        tanh_18 = torch.tanh(mul_153)
+        mul_153 = None
+        add_111 = 1.0 + tanh_18
+        tanh_18 = None
+        hidden_states_92 = mul_151 * add_111
+        mul_151 = add_111 = None
+        hidden_states_93 = torch._C._nn.linear(
+            hidden_states_92,
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_92 = (
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_18 = torch.nn.functional.dropout(
+            hidden_states_93, 0.0, False, False
+        )
+        hidden_states_93 = None
+        add_112 = attn_outputs_18 + feed_forward_hidden_states_18
+        attn_outputs_18 = feed_forward_hidden_states_18 = None
+        hidden_states_94 = add_112 + hidden_states_89
+        add_112 = hidden_states_89 = None
+        hidden_states_95 = torch.nn.functional.layer_norm(
+            hidden_states_94,
+            (2048,),
+            l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_114 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_58 = linear_114.view((1, 2, -1, 64))
+        linear_114 = None
+        query_states_38 = view_58.transpose(1, 2)
+        view_58 = None
+        linear_115 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_59 = linear_115.view((1, 2, -1, 64))
+        linear_115 = None
+        key_states_38 = view_59.transpose(1, 2)
+        view_59 = None
+        linear_116 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_60 = linear_116.view((1, 2, -1, 64))
+        linear_116 = None
+        value_states_19 = view_60.transpose(1, 2)
+        view_60 = None
+        query_rot_19 = query_states_38[(Ellipsis, slice(None, 32, None))]
+        query_pass_19 = query_states_38[(Ellipsis, slice(32, None, None))]
+        query_states_38 = None
+        key_rot_19 = key_states_38[(Ellipsis, slice(None, 32, None))]
+        key_pass_19 = key_states_38[(Ellipsis, slice(32, None, None))]
+        key_states_38 = None
+        cos_22 = cos_2.unsqueeze(1)
+        sin_22 = sin_2.unsqueeze(1)
+        mul_155 = query_rot_19 * cos_22
+        x1_38 = query_rot_19[(Ellipsis, slice(None, 16, None))]
+        x2_38 = query_rot_19[(Ellipsis, slice(16, None, None))]
+        query_rot_19 = None
+        neg_38 = -x2_38
+        x2_38 = None
+        cat_77 = torch.cat((neg_38, x1_38), dim=-1)
+        neg_38 = x1_38 = None
+        mul_156 = cat_77 * sin_22
+        cat_77 = None
+        q_embed_19 = mul_155 + mul_156
+        mul_155 = mul_156 = None
+        mul_157 = key_rot_19 * cos_22
+        cos_22 = None
+        x1_39 = key_rot_19[(Ellipsis, slice(None, 16, None))]
+        x2_39 = key_rot_19[(Ellipsis, slice(16, None, None))]
+        key_rot_19 = None
+        neg_39 = -x2_39
+        x2_39 = None
+        cat_78 = torch.cat((neg_39, x1_39), dim=-1)
+        neg_39 = x1_39 = None
+        mul_158 = cat_78 * sin_22
+        cat_78 = sin_22 = None
+        k_embed_19 = mul_157 + mul_158
+        mul_157 = mul_158 = None
+        query_states_39 = torch.cat((q_embed_19, query_pass_19), dim=-1)
+        q_embed_19 = query_pass_19 = None
+        key_states_39 = torch.cat((k_embed_19, key_pass_19), dim=-1)
+        k_embed_19 = key_pass_19 = None
+        attention_mask_20 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_19 = query_states_39.contiguous()
+        query_states_39 = None
+        key_19 = key_states_39.contiguous()
+        value_19 = value_states_19.contiguous()
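+        # Rebinding every consumed temporary (and each used weight) to None is
+        # how the generated graph releases tensors as early as possible; it is
+        # bookkeeping only and does not change the computed values.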
torch._C._nn.scaled_dot_product_attention( + query_19, + key_19, + value_19, + attn_mask=attention_mask_20, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_19 = key_19 = value_19 = attention_mask_20 = None + transpose_80 = attn_output_76.transpose(1, 2) + attn_output_76 = None + attn_output_77 = transpose_80.contiguous() + transpose_80 = None + reshape_19 = attn_output_77.reshape(1, 2, -1) + attn_output_77 = None + attn_output_78 = reshape_19.contiguous() + reshape_19 = None + attn_output_79 = torch._C._nn.linear( + attn_output_78, + l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_78 = l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_19 = torch.nn.functional.dropout(attn_output_79, 0.0, False, False) + attn_output_79 = None + hidden_states_96 = torch._C._nn.linear( + hidden_states_95, + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_95 = ( + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_159 = 0.5 * hidden_states_96 + pow_20 = torch.pow(hidden_states_96, 3.0) + mul_160 = 0.044715 * pow_20 + pow_20 = None + add_116 = hidden_states_96 + mul_160 + hidden_states_96 = mul_160 = None + mul_161 = 0.7978845608028654 * add_116 + add_116 = None + tanh_19 = torch.tanh(mul_161) + mul_161 = None + add_117 = 1.0 + tanh_19 + tanh_19 = None + hidden_states_97 = mul_159 * add_117 + mul_159 = add_117 = None + hidden_states_98 = torch._C._nn.linear( + hidden_states_97, + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_97 = ( + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_19 = torch.nn.functional.dropout( + hidden_states_98, 0.0, False, False + ) + hidden_states_98 = None + add_118 = attn_outputs_19 + feed_forward_hidden_states_19 + attn_outputs_19 = feed_forward_hidden_states_19 = None + hidden_states_99 = add_118 + hidden_states_94 + add_118 = hidden_states_94 = None + hidden_states_100 = torch.nn.functional.layer_norm( + hidden_states_99, + (2048,), + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_ + ) = None + linear_120 = torch._C._nn.linear( + hidden_states_100, + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_61 = linear_120.view((1, 2, -1, 64)) + linear_120 = None + query_states_40 = 
view_61.transpose(1, 2) + view_61 = None + linear_121 = torch._C._nn.linear( + hidden_states_100, + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_62 = linear_121.view((1, 2, -1, 64)) + linear_121 = None + key_states_40 = view_62.transpose(1, 2) + view_62 = None + linear_122 = torch._C._nn.linear( + hidden_states_100, + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_63 = linear_122.view((1, 2, -1, 64)) + linear_122 = None + value_states_20 = view_63.transpose(1, 2) + view_63 = None + query_rot_20 = query_states_40[(Ellipsis, slice(None, 32, None))] + query_pass_20 = query_states_40[(Ellipsis, slice(32, None, None))] + query_states_40 = None + key_rot_20 = key_states_40[(Ellipsis, slice(None, 32, None))] + key_pass_20 = key_states_40[(Ellipsis, slice(32, None, None))] + key_states_40 = None + cos_23 = cos_2.unsqueeze(1) + sin_23 = sin_2.unsqueeze(1) + mul_163 = query_rot_20 * cos_23 + x1_40 = query_rot_20[(Ellipsis, slice(None, 16, None))] + x2_40 = query_rot_20[(Ellipsis, slice(16, None, None))] + query_rot_20 = None + neg_40 = -x2_40 + x2_40 = None + cat_81 = torch.cat((neg_40, x1_40), dim=-1) + neg_40 = x1_40 = None + mul_164 = cat_81 * sin_23 + cat_81 = None + q_embed_20 = mul_163 + mul_164 + mul_163 = mul_164 = None + mul_165 = key_rot_20 * cos_23 + cos_23 = None + x1_41 = key_rot_20[(Ellipsis, slice(None, 16, None))] + x2_41 = key_rot_20[(Ellipsis, slice(16, None, None))] + key_rot_20 = None + neg_41 = -x2_41 + x2_41 = None + cat_82 = torch.cat((neg_41, x1_41), dim=-1) + neg_41 = x1_41 = None + mul_166 = cat_82 * sin_23 + cat_82 = sin_23 = None + k_embed_20 = mul_165 + mul_166 + mul_165 = mul_166 = None + query_states_41 = torch.cat((q_embed_20, query_pass_20), dim=-1) + q_embed_20 = query_pass_20 = None + key_states_41 = torch.cat((k_embed_20, key_pass_20), dim=-1) + k_embed_20 = key_pass_20 = None + attention_mask_21 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_20 = query_states_41.contiguous() + query_states_41 = None + key_20 = key_states_41.contiguous() + value_20 = value_states_20.contiguous() + attn_output_80 = torch._C._nn.scaled_dot_product_attention( + query_20, + key_20, + value_20, + attn_mask=attention_mask_21, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_20 = key_20 = value_20 = attention_mask_21 = None + transpose_84 = attn_output_80.transpose(1, 2) + attn_output_80 = None + attn_output_81 = transpose_84.contiguous() + transpose_84 = None + reshape_20 = attn_output_81.reshape(1, 2, -1) + attn_output_81 = None + attn_output_82 = reshape_20.contiguous() + reshape_20 = None + attn_output_83 = torch._C._nn.linear( + attn_output_82, + l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_82 = 
l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_20 = torch.nn.functional.dropout(attn_output_83, 0.0, False, False) + attn_output_83 = None + hidden_states_101 = torch._C._nn.linear( + hidden_states_100, + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_100 = ( + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_167 = 0.5 * hidden_states_101 + pow_21 = torch.pow(hidden_states_101, 3.0) + mul_168 = 0.044715 * pow_21 + pow_21 = None + add_122 = hidden_states_101 + mul_168 + hidden_states_101 = mul_168 = None + mul_169 = 0.7978845608028654 * add_122 + add_122 = None + tanh_20 = torch.tanh(mul_169) + mul_169 = None + add_123 = 1.0 + tanh_20 + tanh_20 = None + hidden_states_102 = mul_167 * add_123 + mul_167 = add_123 = None + hidden_states_103 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_102 = ( + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_20 = torch.nn.functional.dropout( + hidden_states_103, 0.0, False, False + ) + hidden_states_103 = None + add_124 = attn_outputs_20 + feed_forward_hidden_states_20 + attn_outputs_20 = feed_forward_hidden_states_20 = None + hidden_states_104 = add_124 + hidden_states_99 + add_124 = hidden_states_99 = None + hidden_states_105 = torch.nn.functional.layer_norm( + hidden_states_104, + (2048,), + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_ + ) = None + linear_126 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_64 = linear_126.view((1, 2, -1, 64)) + linear_126 = None + query_states_42 = view_64.transpose(1, 2) + view_64 = None + linear_127 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_65 = linear_127.view((1, 2, -1, 64)) + linear_127 = None + key_states_42 = view_65.transpose(1, 2) + view_65 = None + linear_128 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_, + 
        l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_66 = linear_128.view((1, 2, -1, 64))
+    linear_128 = None
+    value_states_21 = view_66.transpose(1, 2)
+    view_66 = None
+    query_rot_21 = query_states_42[(Ellipsis, slice(None, 32, None))]
+    query_pass_21 = query_states_42[(Ellipsis, slice(32, None, None))]
+    query_states_42 = None
+    key_rot_21 = key_states_42[(Ellipsis, slice(None, 32, None))]
+    key_pass_21 = key_states_42[(Ellipsis, slice(32, None, None))]
+    key_states_42 = None
+    cos_24 = cos_2.unsqueeze(1)
+    sin_24 = sin_2.unsqueeze(1)
+    mul_171 = query_rot_21 * cos_24
+    x1_42 = query_rot_21[(Ellipsis, slice(None, 16, None))]
+    x2_42 = query_rot_21[(Ellipsis, slice(16, None, None))]
+    query_rot_21 = None
+    neg_42 = -x2_42
+    x2_42 = None
+    cat_85 = torch.cat((neg_42, x1_42), dim=-1)
+    neg_42 = x1_42 = None
+    mul_172 = cat_85 * sin_24
+    cat_85 = None
+    q_embed_21 = mul_171 + mul_172
+    mul_171 = mul_172 = None
+    mul_173 = key_rot_21 * cos_24
+    cos_24 = None
+    x1_43 = key_rot_21[(Ellipsis, slice(None, 16, None))]
+    x2_43 = key_rot_21[(Ellipsis, slice(16, None, None))]
+    key_rot_21 = None
+    neg_43 = -x2_43
+    x2_43 = None
+    cat_86 = torch.cat((neg_43, x1_43), dim=-1)
+    neg_43 = x1_43 = None
+    mul_174 = cat_86 * sin_24
+    cat_86 = sin_24 = None
+    k_embed_21 = mul_173 + mul_174
+    mul_173 = mul_174 = None
+    query_states_43 = torch.cat((q_embed_21, query_pass_21), dim=-1)
+    q_embed_21 = query_pass_21 = None
+    key_states_43 = torch.cat((k_embed_21, key_pass_21), dim=-1)
+    k_embed_21 = key_pass_21 = None
+    attention_mask_22 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_21 = query_states_43.contiguous()
+    query_states_43 = None
+    key_21 = key_states_43.contiguous()
+    value_21 = value_states_21.contiguous()
+    attn_output_84 = torch._C._nn.scaled_dot_product_attention(
+        query_21,
+        key_21,
+        value_21,
+        attn_mask=attention_mask_22,
+        dropout_p=0.0,
+        scale=0.125,
+        is_causal=False,
+    )
+    query_21 = key_21 = value_21 = attention_mask_22 = None
+    transpose_88 = attn_output_84.transpose(1, 2)
+    attn_output_84 = None
+    attn_output_85 = transpose_88.contiguous()
+    transpose_88 = None
+    reshape_21 = attn_output_85.reshape(1, 2, -1)
+    attn_output_85 = None
+    attn_output_86 = reshape_21.contiguous()
+    reshape_21 = None
+    attn_output_87 = torch._C._nn.linear(
+        attn_output_86,
+        l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_86 = l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_21 = torch.nn.functional.dropout(attn_output_87, 0.0, False, False)
+    attn_output_87 = None
+    hidden_states_106 = torch._C._nn.linear(
+        hidden_states_105,
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_105 = (
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_175 = 0.5 * hidden_states_106
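+    # --- reviewer note (not part of the captured graph) ----------------------
+    # The block above is one phi-1 decoder layer as unrolled by torch.compile:
+    # rotary position embeddings are applied to the first 32 channels of each
+    # 64-dim head (the "rot" half, rotate-half form), and scale=0.125 is
+    # 1/sqrt(64) for head_dim 64. The lines below spell out the tanh
+    # ("gelu_new"-style) approximation of GELU; with x = hidden_states_106
+    # they should be equivalent to
+    #     0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * (x + 0.044715 * x**3)))
+    # where 0.7978845608028654 is sqrt(2 / pi).
+    # --------------------------------------------------------------------------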
+    pow_22 = torch.pow(hidden_states_106, 3.0)
+    mul_176 = 0.044715 * pow_22
+    pow_22 = None
+    add_128 = hidden_states_106 + mul_176
+    hidden_states_106 = mul_176 = None
+    mul_177 = 0.7978845608028654 * add_128
+    add_128 = None
+    tanh_21 = torch.tanh(mul_177)
+    mul_177 = None
+    add_129 = 1.0 + tanh_21
+    tanh_21 = None
+    hidden_states_107 = mul_175 * add_129
+    mul_175 = add_129 = None
+    hidden_states_108 = torch._C._nn.linear(
+        hidden_states_107,
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_107 = (
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_21 = torch.nn.functional.dropout(
+        hidden_states_108, 0.0, False, False
+    )
+    hidden_states_108 = None
+    add_130 = attn_outputs_21 + feed_forward_hidden_states_21
+    attn_outputs_21 = feed_forward_hidden_states_21 = None
+    hidden_states_109 = add_130 + hidden_states_104
+    add_130 = hidden_states_104 = None
+    hidden_states_110 = torch.nn.functional.layer_norm(
+        hidden_states_109,
+        (2048,),
+        l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_132 = torch._C._nn.linear(
+        hidden_states_110,
+        l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_67 = linear_132.view((1, 2, -1, 64))
+    linear_132 = None
+    query_states_44 = view_67.transpose(1, 2)
+    view_67 = None
+    linear_133 = torch._C._nn.linear(
+        hidden_states_110,
+        l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_68 = linear_133.view((1, 2, -1, 64))
+    linear_133 = None
+    key_states_44 = view_68.transpose(1, 2)
+    view_68 = None
+    linear_134 = torch._C._nn.linear(
+        hidden_states_110,
+        l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_69 = linear_134.view((1, 2, -1, 64))
+    linear_134 = None
+    value_states_22 = view_69.transpose(1, 2)
+    view_69 = None
+    query_rot_22 = query_states_44[(Ellipsis, slice(None, 32, None))]
+    query_pass_22 = query_states_44[(Ellipsis, slice(32, None, None))]
+    query_states_44 = None
+    key_rot_22 = key_states_44[(Ellipsis, slice(None, 32, None))]
+    key_pass_22 = key_states_44[(Ellipsis, slice(32, None, None))]
+    key_states_44 = None
+    cos_25 = cos_2.unsqueeze(1)
+    sin_25 =
sin_2.unsqueeze(1) + mul_179 = query_rot_22 * cos_25 + x1_44 = query_rot_22[(Ellipsis, slice(None, 16, None))] + x2_44 = query_rot_22[(Ellipsis, slice(16, None, None))] + query_rot_22 = None + neg_44 = -x2_44 + x2_44 = None + cat_89 = torch.cat((neg_44, x1_44), dim=-1) + neg_44 = x1_44 = None + mul_180 = cat_89 * sin_25 + cat_89 = None + q_embed_22 = mul_179 + mul_180 + mul_179 = mul_180 = None + mul_181 = key_rot_22 * cos_25 + cos_25 = None + x1_45 = key_rot_22[(Ellipsis, slice(None, 16, None))] + x2_45 = key_rot_22[(Ellipsis, slice(16, None, None))] + key_rot_22 = None + neg_45 = -x2_45 + x2_45 = None + cat_90 = torch.cat((neg_45, x1_45), dim=-1) + neg_45 = x1_45 = None + mul_182 = cat_90 * sin_25 + cat_90 = sin_25 = None + k_embed_22 = mul_181 + mul_182 + mul_181 = mul_182 = None + query_states_45 = torch.cat((q_embed_22, query_pass_22), dim=-1) + q_embed_22 = query_pass_22 = None + key_states_45 = torch.cat((k_embed_22, key_pass_22), dim=-1) + k_embed_22 = key_pass_22 = None + attention_mask_23 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_22 = query_states_45.contiguous() + query_states_45 = None + key_22 = key_states_45.contiguous() + value_22 = value_states_22.contiguous() + attn_output_88 = torch._C._nn.scaled_dot_product_attention( + query_22, + key_22, + value_22, + attn_mask=attention_mask_23, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_22 = key_22 = value_22 = attention_mask_23 = None + transpose_92 = attn_output_88.transpose(1, 2) + attn_output_88 = None + attn_output_89 = transpose_92.contiguous() + transpose_92 = None + reshape_22 = attn_output_89.reshape(1, 2, -1) + attn_output_89 = None + attn_output_90 = reshape_22.contiguous() + reshape_22 = None + attn_output_91 = torch._C._nn.linear( + attn_output_90, + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_90 = l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_22 = torch.nn.functional.dropout(attn_output_91, 0.0, False, False) + attn_output_91 = None + hidden_states_111 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_110 = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_183 = 0.5 * hidden_states_111 + pow_23 = torch.pow(hidden_states_111, 3.0) + mul_184 = 0.044715 * pow_23 + pow_23 = None + add_134 = hidden_states_111 + mul_184 + hidden_states_111 = mul_184 = None + mul_185 = 0.7978845608028654 * add_134 + add_134 = None + tanh_22 = torch.tanh(mul_185) + mul_185 = None + add_135 = 1.0 + tanh_22 + tanh_22 = None + hidden_states_112 = mul_183 * add_135 + mul_183 = add_135 = None + hidden_states_113 = torch._C._nn.linear( + hidden_states_112, + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_112 = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + 
l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_22 = torch.nn.functional.dropout( + hidden_states_113, 0.0, False, False + ) + hidden_states_113 = None + add_136 = attn_outputs_22 + feed_forward_hidden_states_22 + attn_outputs_22 = feed_forward_hidden_states_22 = None + hidden_states_114 = add_136 + hidden_states_109 + add_136 = hidden_states_109 = None + hidden_states_115 = torch.nn.functional.layer_norm( + hidden_states_114, + (2048,), + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_ + ) = None + linear_138 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_70 = linear_138.view((1, 2, -1, 64)) + linear_138 = None + query_states_46 = view_70.transpose(1, 2) + view_70 = None + linear_139 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_71 = linear_139.view((1, 2, -1, 64)) + linear_139 = None + key_states_46 = view_71.transpose(1, 2) + view_71 = None + linear_140 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_72 = linear_140.view((1, 2, -1, 64)) + linear_140 = None + value_states_23 = view_72.transpose(1, 2) + view_72 = None + query_rot_23 = query_states_46[(Ellipsis, slice(None, 32, None))] + query_pass_23 = query_states_46[(Ellipsis, slice(32, None, None))] + query_states_46 = None + key_rot_23 = key_states_46[(Ellipsis, slice(None, 32, None))] + key_pass_23 = key_states_46[(Ellipsis, slice(32, None, None))] + key_states_46 = None + cos_26 = cos_2.unsqueeze(1) + cos_2 = None + sin_26 = sin_2.unsqueeze(1) + sin_2 = None + mul_187 = query_rot_23 * cos_26 + x1_46 = query_rot_23[(Ellipsis, slice(None, 16, None))] + x2_46 = query_rot_23[(Ellipsis, slice(16, None, None))] + query_rot_23 = None + neg_46 = -x2_46 + x2_46 = None + cat_93 = torch.cat((neg_46, x1_46), dim=-1) + neg_46 = x1_46 = None + mul_188 = cat_93 * sin_26 + cat_93 = None + q_embed_23 = mul_187 + mul_188 + mul_187 = mul_188 = None + mul_189 = key_rot_23 * cos_26 + cos_26 = None + x1_47 = key_rot_23[(Ellipsis, slice(None, 16, None))] + x2_47 = key_rot_23[(Ellipsis, slice(16, None, None))] + key_rot_23 = None + neg_47 = -x2_47 + x2_47 = None + cat_94 = torch.cat((neg_47, x1_47), dim=-1) + neg_47 = x1_47 = None + mul_190 = cat_94 * 
sin_26 + cat_94 = sin_26 = None + k_embed_23 = mul_189 + mul_190 + mul_189 = mul_190 = None + query_states_47 = torch.cat((q_embed_23, query_pass_23), dim=-1) + q_embed_23 = query_pass_23 = None + key_states_47 = torch.cat((k_embed_23, key_pass_23), dim=-1) + k_embed_23 = key_pass_23 = None + attention_mask_24 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + causal_mask_2 = None + query_23 = query_states_47.contiguous() + query_states_47 = None + key_23 = key_states_47.contiguous() + value_23 = value_states_23.contiguous() + attn_output_92 = torch._C._nn.scaled_dot_product_attention( + query_23, + key_23, + value_23, + attn_mask=attention_mask_24, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_23 = key_23 = value_23 = attention_mask_24 = None + transpose_96 = attn_output_92.transpose(1, 2) + attn_output_92 = None + attn_output_93 = transpose_96.contiguous() + transpose_96 = None + reshape_23 = attn_output_93.reshape(1, 2, -1) + attn_output_93 = None + attn_output_94 = reshape_23.contiguous() + reshape_23 = None + attn_output_95 = torch._C._nn.linear( + attn_output_94, + l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_94 = l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_23 = torch.nn.functional.dropout(attn_output_95, 0.0, False, False) + attn_output_95 = None + hidden_states_116 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_115 = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_191 = 0.5 * hidden_states_116 + pow_24 = torch.pow(hidden_states_116, 3.0) + mul_192 = 0.044715 * pow_24 + pow_24 = None + add_140 = hidden_states_116 + mul_192 + hidden_states_116 = mul_192 = None + mul_193 = 0.7978845608028654 * add_140 + add_140 = None + tanh_23 = torch.tanh(mul_193) + mul_193 = None + add_141 = 1.0 + tanh_23 + tanh_23 = None + hidden_states_117 = mul_191 * add_141 + mul_191 = add_141 = None + hidden_states_118 = torch._C._nn.linear( + hidden_states_117, + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_117 = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_23 = torch.nn.functional.dropout( + hidden_states_118, 0.0, False, False + ) + hidden_states_118 = None + add_142 = attn_outputs_23 + feed_forward_hidden_states_23 + attn_outputs_23 = feed_forward_hidden_states_23 = None + hidden_states_119 = add_142 + hidden_states_114 + add_142 = hidden_states_114 = None + hidden_states_120 = torch.nn.functional.layer_norm( + hidden_states_119, + (2048,), + l_self_modules_final_layernorm_parameters_weight_, + l_self_modules_final_layernorm_parameters_bias_, + 1e-05, + ) + hidden_states_119 = ( + l_self_modules_final_layernorm_parameters_weight_ + ) = 
l_self_modules_final_layernorm_parameters_bias_ = None
+    return (
+        value_states,
+        key_states_1,
+        value_states_1,
+        key_states_3,
+        value_states_2,
+        key_states_5,
+        value_states_3,
+        key_states_7,
+        value_states_4,
+        key_states_9,
+        value_states_5,
+        key_states_11,
+        value_states_6,
+        key_states_13,
+        value_states_7,
+        key_states_15,
+        value_states_8,
+        key_states_17,
+        value_states_9,
+        key_states_19,
+        value_states_10,
+        key_states_21,
+        value_states_11,
+        key_states_23,
+        value_states_12,
+        key_states_25,
+        value_states_13,
+        key_states_27,
+        value_states_14,
+        key_states_29,
+        value_states_15,
+        key_states_31,
+        value_states_16,
+        key_states_33,
+        value_states_17,
+        key_states_35,
+        value_states_18,
+        key_states_37,
+        value_states_19,
+        key_states_39,
+        value_states_20,
+        key_states_41,
+        value_states_21,
+        key_states_43,
+        value_states_22,
+        key_states_45,
+        value_states_23,
+        key_states_47,
+        hidden_states_120,
+    )
diff --git a/samples/transformers-auto-model/microsoft/phi-1/weight_meta.py b/samples/transformers-auto-model/microsoft/phi-1/weight_meta.py
new file mode 100644
index 000000000..59e37f72b
--- /dev/null
+++ b/samples/transformers-auto-model/microsoft/phi-1/weight_meta.py
@@ -0,0 +1,3425 @@
+class Program_weight_tensor_meta_L_inputs_embeds_:
+    name = "L_inputs_embeds_"
+    shape = [1, 2, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_attention_mask_:
+    name = "L_attention_mask_"
+    shape = [1, 2]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 1]
+
+
+class Program_weight_tensor_meta_L_self_modules_rotary_emb_buffers_inv_freq_:
+    name = "L_self_modules_rotary_emb_buffers_inv_freq_"
+    shape = [16]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.143
+    std = 0.275
+    data = [
+        1.000000,
+        0.562341,
+        0.316228,
+        0.177828,
+        0.100000,
+        0.056234,
+        0.031623,
+        0.017783,
+        0.010000,
+        0.005623,
+        0.003162,
+        0.001778,
+        0.001000,
+        0.000562,
+        0.000316,
+        0.000178,
+    ]
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype
= "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class
Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_final_layernorm_parameters_weight_: + name = "L_self_modules_final_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_final_layernorm_parameters_bias_: + name = "L_self_modules_final_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/microsoft/phi-1_5/graph_hash.txt b/samples/transformers-auto-model/microsoft/phi-1_5/graph_hash.txt new file mode 100644 index 000000000..59541f26d --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1_5/graph_hash.txt @@ -0,0 +1 @@ +29692b7ae99c32cef41877ec65f7519b9b8172beb79c4efcde645497ebd81e06 \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/phi-1_5/graph_net.json b/samples/transformers-auto-model/microsoft/phi-1_5/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1_5/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/microsoft/phi-1_5/input_meta.py b/samples/transformers-auto-model/microsoft/phi-1_5/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/phi-1_5/input_tensor_constraints.py b/samples/transformers-auto-model/microsoft/phi-1_5/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/microsoft/phi-1_5/model.py b/samples/transformers-auto-model/microsoft/phi-1_5/model.py new file mode 100644 index 000000000..0bf94fe64 --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1_5/model.py @@ -0,0 +1,4894 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_inputs_embeds_: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_self_modules_rotary_emb_buffers_inv_freq_: torch.Tensor, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_: 
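+        # Descriptive note (editorial comment, not part of the captured graph's
+        # semantics): each flat argument name appears to encode the source
+        # module path of the traced model, e.g. the parameter above,
+        # L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_,
+        # would correspond to layers.0.self_attn.dense.weight, with per-tensor
+        # shape/dtype/device statistics recorded in the matching
+        # Program_weight_tensor_meta_* class of weight_meta.py.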
torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_: 
torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_final_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_final_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_inputs_embeds_ = L_inputs_embeds_ + l_attention_mask_ = L_attention_mask_ + l_self_modules_rotary_emb_buffers_inv_freq_ = ( + L_self_modules_rotary_emb_buffers_inv_freq_ + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ + 
l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ = ( + 
L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_ = 
L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ + ) + 
l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_ + 
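+    # NOTE: throughout the generated forward below, each local is reassigned
+    # to None immediately after its last use. FX code generation emits these
+    # "del"-style assignments so intermediate tensors are released as early
+    # as in eager execution, keeping peak memory comparable. A minimal
+    # sketch of the pattern:
+    #
+    #     tmp = x.relu()
+    #     y = tmp + 1
+    #     tmp = None  # drop the last reference so the buffer can be reclaimed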
l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = 
L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_ = 
L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = 
L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_ = 
L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = 
L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_ = 
L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = 
L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_ = 
L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = 
L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_ = 
L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = 
L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_
+    l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_
+    l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_
+    l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_
+    l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_
+    l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_
+    l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_
+    l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_
+    l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_ = (
+        L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_
+    )
+    l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_ = (
+        L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_
+    )
+    l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_ = (
+        L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_
+    )
+    l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_ = (
+        L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_
+    )
+    l_self_modules_final_layernorm_parameters_weight_ = (
+        L_self_modules_final_layernorm_parameters_weight_
+    )
+    l_self_modules_final_layernorm_parameters_bias_ = (
+        L_self_modules_final_layernorm_parameters_bias_
+    )
+    cache_position = torch.arange(0, 2, device=device(type="cuda", index=0))
+    position_ids = cache_position.unsqueeze(0)
+    attention_mask = l_attention_mask_.to(
+        device=device(type="cuda", index=0), dtype=torch.bool
+    )
+    l_attention_mask_ = None
+    mask_indices = torch.arange(2, device=device(type="cuda", index=0))
+    mask_indices += 0
+    mask_indices_1 = mask_indices
+    mask_indices = None
+    local_padding_mask = attention_mask[(slice(None, None, None), mask_indices_1)]
+    attention_mask = mask_indices_1 = None
+    kv_arange = torch.arange(2, device=device(type="cuda", index=0))
+    kv_arange += 0
+    kv_arange_1 = kv_arange
+    kv_arange = None
+    reshaped_cache_position = cache_position.view(-1, 1)
+    cache_position = None
+    causal_mask = kv_arange_1 <= reshaped_cache_position
+    kv_arange_1 = reshaped_cache_position = None
+    getitem_1 = causal_mask[
+        (None, None, slice(None, None, None), slice(None, None, None))
+    ]
+    causal_mask = None
+    causal_mask_1 = getitem_1.expand(1, -1, -1, -1)
+    getitem_1 = None
+    getitem_2 = local_padding_mask[
+        (slice(None, None, None), None, None, slice(None, None, None))
+    ]
+    local_padding_mask = None
+    causal_mask_2 = causal_mask_1 * getitem_2
+    causal_mask_1 = getitem_2 = None
+    inputs_embeds = torch.nn.functional.dropout(l_inputs_embeds_, 0.0, False, False)
+    l_inputs_embeds_ = None
+    _set_grad_enabled = torch._C._set_grad_enabled(False)
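+    # NOTE: the block above builds the 4-D attention mask consumed by
+    # scaled_dot_product_attention below: a lower-triangular causal mask
+    # (kv_arange_1 <= reshaped_cache_position) broadcast to
+    # (batch, 1, q_len, kv_len) and multiplied by the boolean padding mask.
+    # A minimal eager-mode sketch of the same construction, assuming the
+    # seq_len == 2 and batch == 1 hard-coded into this trace (padding is a
+    # hypothetical (batch, kv_len) bool tensor, not a name from this graph):
+    #
+    #     import torch
+    #
+    #     def build_mask(padding, seq_len=2):
+    #         pos = torch.arange(seq_len, device=padding.device)
+    #         causal = pos[None, :] <= pos[:, None]           # (q, kv), keep kv <= q
+    #         return causal[None, None] & padding[:, None, None, :]  # (b, 1, q, kv)
+    #
+    #     build_mask(torch.ones(1, 2, dtype=torch.bool))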
+    _set_grad_enabled = None
+    # Rotary position embedding: outer product of inv_freq with the positions,
+    # duplicated along the last dim, then cos/sin tables cast to fp16.
+    getitem_3 = l_self_modules_rotary_emb_buffers_inv_freq_[
+        (None, slice(None, None, None), None)
+    ]
+    l_self_modules_rotary_emb_buffers_inv_freq_ = None
+    float_1 = getitem_3.float()
+    getitem_3 = None
+    expand_1 = float_1.expand(1, -1, 1)
+    float_1 = None
+    inv_freq_expanded = expand_1.to(device(type="cuda", index=0))
+    expand_1 = None
+    getitem_4 = position_ids[
+        (slice(None, None, None), None, slice(None, None, None))
+    ]
+    position_ids = None
+    position_ids_expanded = getitem_4.float()
+    getitem_4 = None
+    float_3 = inv_freq_expanded.float()
+    inv_freq_expanded = None
+    float_4 = position_ids_expanded.float()
+    position_ids_expanded = None
+    matmul = float_3 @ float_4
+    float_3 = float_4 = None
+    freqs = matmul.transpose(1, 2)
+    matmul = None
+    emb = torch.cat((freqs, freqs), dim=-1)
+    freqs = None
+    cos = emb.cos()
+    cos_1 = cos * 1.0
+    cos = None
+    sin = emb.sin()
+    emb = None
+    sin_1 = sin * 1.0
+    sin = None
+    cos_2 = cos_1.to(dtype=torch.float16)
+    cos_1 = None
+    sin_2 = sin_1.to(dtype=torch.float16)
+    sin_1 = None
+    _set_grad_enabled_1 = torch._C._set_grad_enabled(True)
+    _set_grad_enabled_1 = None
+    _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module")
+    _log_api_usage_once = None
+    # Decoder layer 0: pre-attention LayerNorm, then separate q/k/v
+    # projections, each reshaped to (batch, seq, heads, head_dim=64) and
+    # transposed to (batch, heads, seq, head_dim).
+    hidden_states = torch.nn.functional.layer_norm(
+        inputs_embeds,
+        (2048,),
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear = torch._C._nn.linear(
+        hidden_states,
+        l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_1 = linear.view((1, 2, -1, 64))
+    linear = None
+    query_states = view_1.transpose(1, 2)
+    view_1 = None
+    linear_1 = torch._C._nn.linear(
+        hidden_states,
+        l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_2 = linear_1.view((1, 2, -1, 64))
+    linear_1 = None
+    key_states = view_2.transpose(1, 2)
+    view_2 = None
+    linear_2 = torch._C._nn.linear(
+        hidden_states,
+        l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_3 = linear_2.view((1, 2, -1, 64))
+    linear_2 = None
+    value_states = view_3.transpose(1, 2)
+    view_3 = None
+    # Partial rotary embedding: only the first 32 of the 64 head dims are
+    # rotated; the remaining dims pass through unchanged.
+    query_rot = query_states[(Ellipsis, slice(None, 32, None))]
+    query_pass = query_states[(Ellipsis, slice(32, None, None))]
+    query_states = None
+    key_rot = key_states[(Ellipsis, slice(None, 32, None))]
+    key_pass = key_states[(Ellipsis, slice(32, None, None))]
+    key_states = None
+    cos_3 = cos_2.unsqueeze(1)
+    sin_3 = 
sin_2.unsqueeze(1) + mul_3 = query_rot * cos_3 + x1 = query_rot[(Ellipsis, slice(None, 16, None))] + x2 = query_rot[(Ellipsis, slice(16, None, None))] + query_rot = None + neg = -x2 + x2 = None + cat_1 = torch.cat((neg, x1), dim=-1) + neg = x1 = None + mul_4 = cat_1 * sin_3 + cat_1 = None + q_embed = mul_3 + mul_4 + mul_3 = mul_4 = None + mul_5 = key_rot * cos_3 + cos_3 = None + x1_1 = key_rot[(Ellipsis, slice(None, 16, None))] + x2_1 = key_rot[(Ellipsis, slice(16, None, None))] + key_rot = None + neg_1 = -x2_1 + x2_1 = None + cat_2 = torch.cat((neg_1, x1_1), dim=-1) + neg_1 = x1_1 = None + mul_6 = cat_2 * sin_3 + cat_2 = sin_3 = None + k_embed = mul_5 + mul_6 + mul_5 = mul_6 = None + query_states_1 = torch.cat((q_embed, query_pass), dim=-1) + q_embed = query_pass = None + key_states_1 = torch.cat((k_embed, key_pass), dim=-1) + k_embed = key_pass = None + attention_mask_1 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query = query_states_1.contiguous() + query_states_1 = None + key = key_states_1.contiguous() + value = value_states.contiguous() + attn_output = torch._C._nn.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask_1, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query = key = value = attention_mask_1 = None + transpose_4 = attn_output.transpose(1, 2) + attn_output = None + attn_output_1 = transpose_4.contiguous() + transpose_4 = None + reshape = attn_output_1.reshape(1, 2, -1) + attn_output_1 = None + attn_output_2 = reshape.contiguous() + reshape = None + attn_output_3 = torch._C._nn.linear( + attn_output_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs = torch.nn.functional.dropout(attn_output_3, 0.0, False, False) + attn_output_3 = None + hidden_states_1 = torch._C._nn.linear( + hidden_states, + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_7 = 0.5 * hidden_states_1 + pow_1 = torch.pow(hidden_states_1, 3.0) + mul_8 = 0.044715 * pow_1 + pow_1 = None + add_2 = hidden_states_1 + mul_8 + hidden_states_1 = mul_8 = None + mul_9 = 0.7978845608028654 * add_2 + add_2 = None + tanh = torch.tanh(mul_9) + mul_9 = None + add_3 = 1.0 + tanh + tanh = None + hidden_states_2 = mul_7 * add_3 + mul_7 = add_3 = None + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_2 = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states = torch.nn.functional.dropout( + hidden_states_3, 0.0, False, False + ) + hidden_states_3 = None + add_4 = attn_outputs + feed_forward_hidden_states + attn_outputs = feed_forward_hidden_states 
= None + hidden_states_4 = add_4 + inputs_embeds + add_4 = inputs_embeds = None + hidden_states_5 = torch.nn.functional.layer_norm( + hidden_states_4, + (2048,), + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ + ) = None + linear_6 = torch._C._nn.linear( + hidden_states_5, + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_4 = linear_6.view((1, 2, -1, 64)) + linear_6 = None + query_states_2 = view_4.transpose(1, 2) + view_4 = None + linear_7 = torch._C._nn.linear( + hidden_states_5, + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_5 = linear_7.view((1, 2, -1, 64)) + linear_7 = None + key_states_2 = view_5.transpose(1, 2) + view_5 = None + linear_8 = torch._C._nn.linear( + hidden_states_5, + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_6 = linear_8.view((1, 2, -1, 64)) + linear_8 = None + value_states_1 = view_6.transpose(1, 2) + view_6 = None + query_rot_1 = query_states_2[(Ellipsis, slice(None, 32, None))] + query_pass_1 = query_states_2[(Ellipsis, slice(32, None, None))] + query_states_2 = None + key_rot_1 = key_states_2[(Ellipsis, slice(None, 32, None))] + key_pass_1 = key_states_2[(Ellipsis, slice(32, None, None))] + key_states_2 = None + cos_4 = cos_2.unsqueeze(1) + sin_4 = sin_2.unsqueeze(1) + mul_11 = query_rot_1 * cos_4 + x1_2 = query_rot_1[(Ellipsis, slice(None, 16, None))] + x2_2 = query_rot_1[(Ellipsis, slice(16, None, None))] + query_rot_1 = None + neg_2 = -x2_2 + x2_2 = None + cat_5 = torch.cat((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + mul_12 = cat_5 * sin_4 + cat_5 = None + q_embed_1 = mul_11 + mul_12 + mul_11 = mul_12 = None + mul_13 = key_rot_1 * cos_4 + cos_4 = None + x1_3 = key_rot_1[(Ellipsis, slice(None, 16, None))] + x2_3 = key_rot_1[(Ellipsis, slice(16, None, None))] + key_rot_1 = None + neg_3 = -x2_3 + x2_3 = None + cat_6 = torch.cat((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + mul_14 = cat_6 * sin_4 + cat_6 = sin_4 = None + k_embed_1 = mul_13 + mul_14 + mul_13 = mul_14 = None + query_states_3 = torch.cat((q_embed_1, query_pass_1), dim=-1) + q_embed_1 = query_pass_1 = None + key_states_3 = torch.cat((k_embed_1, key_pass_1), dim=-1) + k_embed_1 = key_pass_1 = None + attention_mask_2 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_1 = query_states_3.contiguous() + query_states_3 
= None + key_1 = key_states_3.contiguous() + value_1 = value_states_1.contiguous() + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_1, + value_1, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_1 = key_1 = value_1 = attention_mask_2 = None + transpose_8 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_8.contiguous() + transpose_8 = None + reshape_1 = attn_output_5.reshape(1, 2, -1) + attn_output_5 = None + attn_output_6 = reshape_1.contiguous() + reshape_1 = None + attn_output_7 = torch._C._nn.linear( + attn_output_6, + l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_6 = l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_1 = torch.nn.functional.dropout(attn_output_7, 0.0, False, False) + attn_output_7 = None + hidden_states_6 = torch._C._nn.linear( + hidden_states_5, + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_5 = ( + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_15 = 0.5 * hidden_states_6 + pow_2 = torch.pow(hidden_states_6, 3.0) + mul_16 = 0.044715 * pow_2 + pow_2 = None + add_8 = hidden_states_6 + mul_16 + hidden_states_6 = mul_16 = None + mul_17 = 0.7978845608028654 * add_8 + add_8 = None + tanh_1 = torch.tanh(mul_17) + mul_17 = None + add_9 = 1.0 + tanh_1 + tanh_1 = None + hidden_states_7 = mul_15 * add_9 + mul_15 = add_9 = None + hidden_states_8 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_7 = ( + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_1 = torch.nn.functional.dropout( + hidden_states_8, 0.0, False, False + ) + hidden_states_8 = None + add_10 = attn_outputs_1 + feed_forward_hidden_states_1 + attn_outputs_1 = feed_forward_hidden_states_1 = None + hidden_states_9 = add_10 + hidden_states_4 + add_10 = hidden_states_4 = None + hidden_states_10 = torch.nn.functional.layer_norm( + hidden_states_9, + (2048,), + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ + ) = None + linear_12 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_7 = linear_12.view((1, 2, -1, 64)) + linear_12 = None + query_states_4 = view_7.transpose(1, 
2) + view_7 = None + linear_13 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_8 = linear_13.view((1, 2, -1, 64)) + linear_13 = None + key_states_4 = view_8.transpose(1, 2) + view_8 = None + linear_14 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_9 = linear_14.view((1, 2, -1, 64)) + linear_14 = None + value_states_2 = view_9.transpose(1, 2) + view_9 = None + query_rot_2 = query_states_4[(Ellipsis, slice(None, 32, None))] + query_pass_2 = query_states_4[(Ellipsis, slice(32, None, None))] + query_states_4 = None + key_rot_2 = key_states_4[(Ellipsis, slice(None, 32, None))] + key_pass_2 = key_states_4[(Ellipsis, slice(32, None, None))] + key_states_4 = None + cos_5 = cos_2.unsqueeze(1) + sin_5 = sin_2.unsqueeze(1) + mul_19 = query_rot_2 * cos_5 + x1_4 = query_rot_2[(Ellipsis, slice(None, 16, None))] + x2_4 = query_rot_2[(Ellipsis, slice(16, None, None))] + query_rot_2 = None + neg_4 = -x2_4 + x2_4 = None + cat_9 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_20 = cat_9 * sin_5 + cat_9 = None + q_embed_2 = mul_19 + mul_20 + mul_19 = mul_20 = None + mul_21 = key_rot_2 * cos_5 + cos_5 = None + x1_5 = key_rot_2[(Ellipsis, slice(None, 16, None))] + x2_5 = key_rot_2[(Ellipsis, slice(16, None, None))] + key_rot_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_10 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_22 = cat_10 * sin_5 + cat_10 = sin_5 = None + k_embed_2 = mul_21 + mul_22 + mul_21 = mul_22 = None + query_states_5 = torch.cat((q_embed_2, query_pass_2), dim=-1) + q_embed_2 = query_pass_2 = None + key_states_5 = torch.cat((k_embed_2, key_pass_2), dim=-1) + k_embed_2 = key_pass_2 = None + attention_mask_3 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_2 = query_states_5.contiguous() + query_states_5 = None + key_2 = key_states_5.contiguous() + value_2 = value_states_2.contiguous() + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_2, + value_2, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_2 = key_2 = value_2 = attention_mask_3 = None + transpose_12 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_12.contiguous() + transpose_12 = None + reshape_2 = attn_output_9.reshape(1, 2, -1) + attn_output_9 = None + attn_output_10 = reshape_2.contiguous() + reshape_2 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_10 = l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_ = 
l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_2 = torch.nn.functional.dropout(attn_output_11, 0.0, False, False) + attn_output_11 = None + hidden_states_11 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_10 = ( + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_23 = 0.5 * hidden_states_11 + pow_3 = torch.pow(hidden_states_11, 3.0) + mul_24 = 0.044715 * pow_3 + pow_3 = None + add_14 = hidden_states_11 + mul_24 + hidden_states_11 = mul_24 = None + mul_25 = 0.7978845608028654 * add_14 + add_14 = None + tanh_2 = torch.tanh(mul_25) + mul_25 = None + add_15 = 1.0 + tanh_2 + tanh_2 = None + hidden_states_12 = mul_23 * add_15 + mul_23 = add_15 = None + hidden_states_13 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_12 = ( + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_2 = torch.nn.functional.dropout( + hidden_states_13, 0.0, False, False + ) + hidden_states_13 = None + add_16 = attn_outputs_2 + feed_forward_hidden_states_2 + attn_outputs_2 = feed_forward_hidden_states_2 = None + hidden_states_14 = add_16 + hidden_states_9 + add_16 = hidden_states_9 = None + hidden_states_15 = torch.nn.functional.layer_norm( + hidden_states_14, + (2048,), + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ + ) = None + linear_18 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_10 = linear_18.view((1, 2, -1, 64)) + linear_18 = None + query_states_6 = view_10.transpose(1, 2) + view_10 = None + linear_19 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_11 = linear_19.view((1, 2, -1, 64)) + linear_19 = None + key_states_6 = view_11.transpose(1, 2) + view_11 = None + linear_20 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = 
l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_12 = linear_20.view((1, 2, -1, 64)) + linear_20 = None + value_states_3 = view_12.transpose(1, 2) + view_12 = None + query_rot_3 = query_states_6[(Ellipsis, slice(None, 32, None))] + query_pass_3 = query_states_6[(Ellipsis, slice(32, None, None))] + query_states_6 = None + key_rot_3 = key_states_6[(Ellipsis, slice(None, 32, None))] + key_pass_3 = key_states_6[(Ellipsis, slice(32, None, None))] + key_states_6 = None + cos_6 = cos_2.unsqueeze(1) + sin_6 = sin_2.unsqueeze(1) + mul_27 = query_rot_3 * cos_6 + x1_6 = query_rot_3[(Ellipsis, slice(None, 16, None))] + x2_6 = query_rot_3[(Ellipsis, slice(16, None, None))] + query_rot_3 = None + neg_6 = -x2_6 + x2_6 = None + cat_13 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_28 = cat_13 * sin_6 + cat_13 = None + q_embed_3 = mul_27 + mul_28 + mul_27 = mul_28 = None + mul_29 = key_rot_3 * cos_6 + cos_6 = None + x1_7 = key_rot_3[(Ellipsis, slice(None, 16, None))] + x2_7 = key_rot_3[(Ellipsis, slice(16, None, None))] + key_rot_3 = None + neg_7 = -x2_7 + x2_7 = None + cat_14 = torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_30 = cat_14 * sin_6 + cat_14 = sin_6 = None + k_embed_3 = mul_29 + mul_30 + mul_29 = mul_30 = None + query_states_7 = torch.cat((q_embed_3, query_pass_3), dim=-1) + q_embed_3 = query_pass_3 = None + key_states_7 = torch.cat((k_embed_3, key_pass_3), dim=-1) + k_embed_3 = key_pass_3 = None + attention_mask_4 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_3 = query_states_7.contiguous() + query_states_7 = None + key_3 = key_states_7.contiguous() + value_3 = value_states_3.contiguous() + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_3, + key_3, + value_3, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_3 = key_3 = value_3 = attention_mask_4 = None + transpose_16 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_16.contiguous() + transpose_16 = None + reshape_3 = attn_output_13.reshape(1, 2, -1) + attn_output_13 = None + attn_output_14 = reshape_3.contiguous() + reshape_3 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_14 = l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_3 = torch.nn.functional.dropout(attn_output_15, 0.0, False, False) + attn_output_15 = None + hidden_states_16 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_15 = ( + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_31 = 0.5 * hidden_states_16 + pow_4 = torch.pow(hidden_states_16, 3.0) + mul_32 = 0.044715 * pow_4 + pow_4 = None + add_20 = hidden_states_16 + mul_32 + hidden_states_16 = mul_32 = None + mul_33 = 0.7978845608028654 * add_20 + add_20 = None + tanh_3 = torch.tanh(mul_33) + mul_33 = None + add_21 = 1.0 + tanh_3 + 
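# The scalar chain around this point is the tanh-approximated GELU, which the
+ # tracer has unrolled into primitive ops:
+ #     gelu(x) ~= 0.5 * x * (1.0 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
+ # where 0.7978845608028654 == sqrt(2/pi). Here mul_31 holds 0.5*x and add_21
+ # holds the (1 + tanh(...)) factor; their product just below
+ # (hidden_states_17) is the MLP activation.
+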
tanh_3 = None + hidden_states_17 = mul_31 * add_21 + mul_31 = add_21 = None + hidden_states_18 = torch._C._nn.linear( + hidden_states_17, + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_17 = ( + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_3 = torch.nn.functional.dropout( + hidden_states_18, 0.0, False, False + ) + hidden_states_18 = None + add_22 = attn_outputs_3 + feed_forward_hidden_states_3 + attn_outputs_3 = feed_forward_hidden_states_3 = None + hidden_states_19 = add_22 + hidden_states_14 + add_22 = hidden_states_14 = None + hidden_states_20 = torch.nn.functional.layer_norm( + hidden_states_19, + (2048,), + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ + ) = None + linear_24 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_13 = linear_24.view((1, 2, -1, 64)) + linear_24 = None + query_states_8 = view_13.transpose(1, 2) + view_13 = None + linear_25 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_14 = linear_25.view((1, 2, -1, 64)) + linear_25 = None + key_states_8 = view_14.transpose(1, 2) + view_14 = None + linear_26 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_15 = linear_26.view((1, 2, -1, 64)) + linear_26 = None + value_states_4 = view_15.transpose(1, 2) + view_15 = None + query_rot_4 = query_states_8[(Ellipsis, slice(None, 32, None))] + query_pass_4 = query_states_8[(Ellipsis, slice(32, None, None))] + query_states_8 = None + key_rot_4 = key_states_8[(Ellipsis, slice(None, 32, None))] + key_pass_4 = key_states_8[(Ellipsis, slice(32, None, None))] + key_states_8 = None + cos_7 = cos_2.unsqueeze(1) + sin_7 = sin_2.unsqueeze(1) + mul_35 = query_rot_4 * cos_7 + x1_8 = query_rot_4[(Ellipsis, slice(None, 16, None))] + x2_8 = query_rot_4[(Ellipsis, slice(16, None, None))] + query_rot_4 = None + neg_8 = -x2_8 + x2_8 = None + cat_17 = torch.cat((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + mul_36 = cat_17 * sin_7 + cat_17 = None + q_embed_4 = mul_35 + mul_36 + mul_35 = mul_36 = None + mul_37 
= key_rot_4 * cos_7 + cos_7 = None + x1_9 = key_rot_4[(Ellipsis, slice(None, 16, None))] + x2_9 = key_rot_4[(Ellipsis, slice(16, None, None))] + key_rot_4 = None + neg_9 = -x2_9 + x2_9 = None + cat_18 = torch.cat((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + mul_38 = cat_18 * sin_7 + cat_18 = sin_7 = None + k_embed_4 = mul_37 + mul_38 + mul_37 = mul_38 = None + query_states_9 = torch.cat((q_embed_4, query_pass_4), dim=-1) + q_embed_4 = query_pass_4 = None + key_states_9 = torch.cat((k_embed_4, key_pass_4), dim=-1) + k_embed_4 = key_pass_4 = None + attention_mask_5 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_4 = query_states_9.contiguous() + query_states_9 = None + key_4 = key_states_9.contiguous() + value_4 = value_states_4.contiguous() + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_4, + value_4, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_4 = key_4 = value_4 = attention_mask_5 = None + transpose_20 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_20.contiguous() + transpose_20 = None + reshape_4 = attn_output_17.reshape(1, 2, -1) + attn_output_17 = None + attn_output_18 = reshape_4.contiguous() + reshape_4 = None + attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_18 = l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_4 = torch.nn.functional.dropout(attn_output_19, 0.0, False, False) + attn_output_19 = None + hidden_states_21 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_20 = ( + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_39 = 0.5 * hidden_states_21 + pow_5 = torch.pow(hidden_states_21, 3.0) + mul_40 = 0.044715 * pow_5 + pow_5 = None + add_26 = hidden_states_21 + mul_40 + hidden_states_21 = mul_40 = None + mul_41 = 0.7978845608028654 * add_26 + add_26 = None + tanh_4 = torch.tanh(mul_41) + mul_41 = None + add_27 = 1.0 + tanh_4 + tanh_4 = None + hidden_states_22 = mul_39 * add_27 + mul_39 = add_27 = None + hidden_states_23 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_22 = ( + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_4 = torch.nn.functional.dropout( + hidden_states_23, 0.0, False, False + ) + hidden_states_23 = None + add_28 = attn_outputs_4 + feed_forward_hidden_states_4 + attn_outputs_4 = feed_forward_hidden_states_4 = None + hidden_states_24 = add_28 + hidden_states_19 + add_28 = hidden_states_19 = None + hidden_states_25 = torch.nn.functional.layer_norm( + hidden_states_24, + (2048,), + 
l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ + ) = None + linear_30 = torch._C._nn.linear( + hidden_states_25, + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_16 = linear_30.view((1, 2, -1, 64)) + linear_30 = None + query_states_10 = view_16.transpose(1, 2) + view_16 = None + linear_31 = torch._C._nn.linear( + hidden_states_25, + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_17 = linear_31.view((1, 2, -1, 64)) + linear_31 = None + key_states_10 = view_17.transpose(1, 2) + view_17 = None + linear_32 = torch._C._nn.linear( + hidden_states_25, + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_18 = linear_32.view((1, 2, -1, 64)) + linear_32 = None + value_states_5 = view_18.transpose(1, 2) + view_18 = None + query_rot_5 = query_states_10[(Ellipsis, slice(None, 32, None))] + query_pass_5 = query_states_10[(Ellipsis, slice(32, None, None))] + query_states_10 = None + key_rot_5 = key_states_10[(Ellipsis, slice(None, 32, None))] + key_pass_5 = key_states_10[(Ellipsis, slice(32, None, None))] + key_states_10 = None + cos_8 = cos_2.unsqueeze(1) + sin_8 = sin_2.unsqueeze(1) + mul_43 = query_rot_5 * cos_8 + x1_10 = query_rot_5[(Ellipsis, slice(None, 16, None))] + x2_10 = query_rot_5[(Ellipsis, slice(16, None, None))] + query_rot_5 = None + neg_10 = -x2_10 + x2_10 = None + cat_21 = torch.cat((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + mul_44 = cat_21 * sin_8 + cat_21 = None + q_embed_5 = mul_43 + mul_44 + mul_43 = mul_44 = None + mul_45 = key_rot_5 * cos_8 + cos_8 = None + x1_11 = key_rot_5[(Ellipsis, slice(None, 16, None))] + x2_11 = key_rot_5[(Ellipsis, slice(16, None, None))] + key_rot_5 = None + neg_11 = -x2_11 + x2_11 = None + cat_22 = torch.cat((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + mul_46 = cat_22 * sin_8 + cat_22 = sin_8 = None + k_embed_5 = mul_45 + mul_46 + mul_45 = mul_46 = None + query_states_11 = torch.cat((q_embed_5, query_pass_5), dim=-1) + q_embed_5 = query_pass_5 = None + key_states_11 = torch.cat((k_embed_5, key_pass_5), dim=-1) + k_embed_5 = key_pass_5 = None + attention_mask_6 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_5 = query_states_11.contiguous() + query_states_11 = None + key_5 = key_states_11.contiguous() + value_5 = value_states_5.contiguous() + attn_output_20 = 
torch._C._nn.scaled_dot_product_attention( + query_5, + key_5, + value_5, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_5 = key_5 = value_5 = attention_mask_6 = None + transpose_24 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_24.contiguous() + transpose_24 = None + reshape_5 = attn_output_21.reshape(1, 2, -1) + attn_output_21 = None + attn_output_22 = reshape_5.contiguous() + reshape_5 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_22 = l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_5 = torch.nn.functional.dropout(attn_output_23, 0.0, False, False) + attn_output_23 = None + hidden_states_26 = torch._C._nn.linear( + hidden_states_25, + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_25 = ( + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_47 = 0.5 * hidden_states_26 + pow_6 = torch.pow(hidden_states_26, 3.0) + mul_48 = 0.044715 * pow_6 + pow_6 = None + add_32 = hidden_states_26 + mul_48 + hidden_states_26 = mul_48 = None + mul_49 = 0.7978845608028654 * add_32 + add_32 = None + tanh_5 = torch.tanh(mul_49) + mul_49 = None + add_33 = 1.0 + tanh_5 + tanh_5 = None + hidden_states_27 = mul_47 * add_33 + mul_47 = add_33 = None + hidden_states_28 = torch._C._nn.linear( + hidden_states_27, + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_27 = ( + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_5 = torch.nn.functional.dropout( + hidden_states_28, 0.0, False, False + ) + hidden_states_28 = None + add_34 = attn_outputs_5 + feed_forward_hidden_states_5 + attn_outputs_5 = feed_forward_hidden_states_5 = None + hidden_states_29 = add_34 + hidden_states_24 + add_34 = hidden_states_24 = None + hidden_states_30 = torch.nn.functional.layer_norm( + hidden_states_29, + (2048,), + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ + ) = None + linear_36 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_19 = linear_36.view((1, 2, -1, 64)) + linear_36 = None + query_states_12 = view_19.transpose(1, 2) + view_19 = None + linear_37 = torch._C._nn.linear( + 
hidden_states_30, + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_20 = linear_37.view((1, 2, -1, 64)) + linear_37 = None + key_states_12 = view_20.transpose(1, 2) + view_20 = None + linear_38 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_21 = linear_38.view((1, 2, -1, 64)) + linear_38 = None + value_states_6 = view_21.transpose(1, 2) + view_21 = None + query_rot_6 = query_states_12[(Ellipsis, slice(None, 32, None))] + query_pass_6 = query_states_12[(Ellipsis, slice(32, None, None))] + query_states_12 = None + key_rot_6 = key_states_12[(Ellipsis, slice(None, 32, None))] + key_pass_6 = key_states_12[(Ellipsis, slice(32, None, None))] + key_states_12 = None + cos_9 = cos_2.unsqueeze(1) + sin_9 = sin_2.unsqueeze(1) + mul_51 = query_rot_6 * cos_9 + x1_12 = query_rot_6[(Ellipsis, slice(None, 16, None))] + x2_12 = query_rot_6[(Ellipsis, slice(16, None, None))] + query_rot_6 = None + neg_12 = -x2_12 + x2_12 = None + cat_25 = torch.cat((neg_12, x1_12), dim=-1) + neg_12 = x1_12 = None + mul_52 = cat_25 * sin_9 + cat_25 = None + q_embed_6 = mul_51 + mul_52 + mul_51 = mul_52 = None + mul_53 = key_rot_6 * cos_9 + cos_9 = None + x1_13 = key_rot_6[(Ellipsis, slice(None, 16, None))] + x2_13 = key_rot_6[(Ellipsis, slice(16, None, None))] + key_rot_6 = None + neg_13 = -x2_13 + x2_13 = None + cat_26 = torch.cat((neg_13, x1_13), dim=-1) + neg_13 = x1_13 = None + mul_54 = cat_26 * sin_9 + cat_26 = sin_9 = None + k_embed_6 = mul_53 + mul_54 + mul_53 = mul_54 = None + query_states_13 = torch.cat((q_embed_6, query_pass_6), dim=-1) + q_embed_6 = query_pass_6 = None + key_states_13 = torch.cat((k_embed_6, key_pass_6), dim=-1) + k_embed_6 = key_pass_6 = None + attention_mask_7 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_6 = query_states_13.contiguous() + query_states_13 = None + key_6 = key_states_13.contiguous() + value_6 = value_states_6.contiguous() + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_6, + value_6, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_6 = key_6 = value_6 = attention_mask_7 = None + transpose_28 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_28.contiguous() + transpose_28 = None + reshape_6 = attn_output_25.reshape(1, 2, -1) + attn_output_25 = None + attn_output_26 = reshape_6.contiguous() + reshape_6 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_26 = l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_ = 
(None) + attn_outputs_6 = torch.nn.functional.dropout(attn_output_27, 0.0, False, False) + attn_output_27 = None + hidden_states_31 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_30 = ( + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_55 = 0.5 * hidden_states_31 + pow_7 = torch.pow(hidden_states_31, 3.0) + mul_56 = 0.044715 * pow_7 + pow_7 = None + add_38 = hidden_states_31 + mul_56 + hidden_states_31 = mul_56 = None + mul_57 = 0.7978845608028654 * add_38 + add_38 = None + tanh_6 = torch.tanh(mul_57) + mul_57 = None + add_39 = 1.0 + tanh_6 + tanh_6 = None + hidden_states_32 = mul_55 * add_39 + mul_55 = add_39 = None + hidden_states_33 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_32 = ( + l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_6 = torch.nn.functional.dropout( + hidden_states_33, 0.0, False, False + ) + hidden_states_33 = None + add_40 = attn_outputs_6 + feed_forward_hidden_states_6 + attn_outputs_6 = feed_forward_hidden_states_6 = None + hidden_states_34 = add_40 + hidden_states_29 + add_40 = hidden_states_29 = None + hidden_states_35 = torch.nn.functional.layer_norm( + hidden_states_34, + (2048,), + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_ + ) = None + linear_42 = torch._C._nn.linear( + hidden_states_35, + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_22 = linear_42.view((1, 2, -1, 64)) + linear_42 = None + query_states_14 = view_22.transpose(1, 2) + view_22 = None + linear_43 = torch._C._nn.linear( + hidden_states_35, + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_23 = linear_43.view((1, 2, -1, 64)) + linear_43 = None + key_states_14 = view_23.transpose(1, 2) + view_23 = None + linear_44 = torch._C._nn.linear( + hidden_states_35, + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + 
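# The block below (layer 7) is representative of every decoder layer in this
+ # capture; only the numeric suffixes change from layer to layer. The layout is
+ # consistent with a Phi-style block: the input LayerNorm feeds the q/k/v
+ # projections, rotary position embeddings are applied to the first 32 of the
+ # 64 head dims (partial rotary), and the attention and MLP branches both read
+ # the same normalized input, with both outputs added back to the residual
+ # stream in parallel. A readable sketch of the rotary step (rotate_half as in
+ # HF transformers; the 16/16 split is half of the 32-dim rotary slice):
+ #     x1, x2 = x_rot[..., :16], x_rot[..., 16:]
+ #     x_embed = x_rot * cos + torch.cat((-x2, x1), dim=-1) * sin
+ # The pervasive `name = None` assignments are emitted by the tracer to drop
+ # references as soon as a value is dead, keeping peak memory low.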
+ view_24 = linear_44.view((1, 2, -1, 64))
+ linear_44 = None
+ value_states_7 = view_24.transpose(1, 2)
+ view_24 = None
+ query_rot_7 = query_states_14[(Ellipsis, slice(None, 32, None))]
+ query_pass_7 = query_states_14[(Ellipsis, slice(32, None, None))]
+ query_states_14 = None
+ key_rot_7 = key_states_14[(Ellipsis, slice(None, 32, None))]
+ key_pass_7 = key_states_14[(Ellipsis, slice(32, None, None))]
+ key_states_14 = None
+ cos_10 = cos_2.unsqueeze(1)
+ sin_10 = sin_2.unsqueeze(1)
+ mul_59 = query_rot_7 * cos_10
+ x1_14 = query_rot_7[(Ellipsis, slice(None, 16, None))]
+ x2_14 = query_rot_7[(Ellipsis, slice(16, None, None))]
+ query_rot_7 = None
+ neg_14 = -x2_14
+ x2_14 = None
+ cat_29 = torch.cat((neg_14, x1_14), dim=-1)
+ neg_14 = x1_14 = None
+ mul_60 = cat_29 * sin_10
+ cat_29 = None
+ q_embed_7 = mul_59 + mul_60
+ mul_59 = mul_60 = None
+ mul_61 = key_rot_7 * cos_10
+ cos_10 = None
+ x1_15 = key_rot_7[(Ellipsis, slice(None, 16, None))]
+ x2_15 = key_rot_7[(Ellipsis, slice(16, None, None))]
+ key_rot_7 = None
+ neg_15 = -x2_15
+ x2_15 = None
+ cat_30 = torch.cat((neg_15, x1_15), dim=-1)
+ neg_15 = x1_15 = None
+ mul_62 = cat_30 * sin_10
+ cat_30 = sin_10 = None
+ k_embed_7 = mul_61 + mul_62
+ mul_61 = mul_62 = None
+ query_states_15 = torch.cat((q_embed_7, query_pass_7), dim=-1)
+ q_embed_7 = query_pass_7 = None
+ key_states_15 = torch.cat((k_embed_7, key_pass_7), dim=-1)
+ k_embed_7 = key_pass_7 = None
+ attention_mask_8 = causal_mask_2[
+     (
+         slice(None, None, None),
+         slice(None, None, None),
+         slice(None, None, None),
+         slice(None, 2, None),
+     )
+ ]
+ query_7 = query_states_15.contiguous()
+ query_states_15 = None
+ key_7 = key_states_15.contiguous()
+ value_7 = value_states_7.contiguous()
+ attn_output_28 = torch._C._nn.scaled_dot_product_attention(
+     query_7,
+     key_7,
+     value_7,
+     attn_mask=attention_mask_8,
+     dropout_p=0.0,
+     scale=0.125,
+     is_causal=False,
+ )
+ query_7 = key_7 = value_7 = attention_mask_8 = None
+ transpose_32 = attn_output_28.transpose(1, 2)
+ attn_output_28 = None
+ attn_output_29 = transpose_32.contiguous()
+ transpose_32 = None
+ reshape_7 = attn_output_29.reshape(1, 2, -1)
+ attn_output_29 = None
+ attn_output_30 = reshape_7.contiguous()
+ reshape_7 = None
+ attn_output_31 = torch._C._nn.linear(
+     attn_output_30,
+     l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_,
+     l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_,
+ )
+ attn_output_30 = l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_ = (None)
+ attn_outputs_7 = torch.nn.functional.dropout(attn_output_31, 0.0, False, False)
+ attn_output_31 = None
+ hidden_states_36 = torch._C._nn.linear(
+     hidden_states_35,
+     l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_,
+     l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_,
+ )
+ hidden_states_35 = (
+     l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_
+ ) = (
+     l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_
+ ) = None
+ mul_63 = 0.5 * hidden_states_36
+ pow_8 = torch.pow(hidden_states_36, 3.0)
+ mul_64 = 0.044715 * pow_8
+ pow_8 = None
+ add_44 = hidden_states_36 + mul_64
+ hidden_states_36 = mul_64 = None
+ mul_65 = 0.7978845608028654 * add_44
+ add_44 = None
+ tanh_7 = torch.tanh(mul_65)
+ mul_65 = None
+ add_45 = 1.0 + tanh_7
+ tanh_7 = None
+ hidden_states_37 = mul_63 * add_45
+
mul_63 = add_45 = None + hidden_states_38 = torch._C._nn.linear( + hidden_states_37, + l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_37 = ( + l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_7 = torch.nn.functional.dropout( + hidden_states_38, 0.0, False, False + ) + hidden_states_38 = None + add_46 = attn_outputs_7 + feed_forward_hidden_states_7 + attn_outputs_7 = feed_forward_hidden_states_7 = None + hidden_states_39 = add_46 + hidden_states_34 + add_46 = hidden_states_34 = None + hidden_states_40 = torch.nn.functional.layer_norm( + hidden_states_39, + (2048,), + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_ + ) = None + linear_48 = torch._C._nn.linear( + hidden_states_40, + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_25 = linear_48.view((1, 2, -1, 64)) + linear_48 = None + query_states_16 = view_25.transpose(1, 2) + view_25 = None + linear_49 = torch._C._nn.linear( + hidden_states_40, + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_26 = linear_49.view((1, 2, -1, 64)) + linear_49 = None + key_states_16 = view_26.transpose(1, 2) + view_26 = None + linear_50 = torch._C._nn.linear( + hidden_states_40, + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_27 = linear_50.view((1, 2, -1, 64)) + linear_50 = None + value_states_8 = view_27.transpose(1, 2) + view_27 = None + query_rot_8 = query_states_16[(Ellipsis, slice(None, 32, None))] + query_pass_8 = query_states_16[(Ellipsis, slice(32, None, None))] + query_states_16 = None + key_rot_8 = key_states_16[(Ellipsis, slice(None, 32, None))] + key_pass_8 = key_states_16[(Ellipsis, slice(32, None, None))] + key_states_16 = None + cos_11 = cos_2.unsqueeze(1) + sin_11 = sin_2.unsqueeze(1) + mul_67 = query_rot_8 * cos_11 + x1_16 = query_rot_8[(Ellipsis, slice(None, 16, None))] + x2_16 = query_rot_8[(Ellipsis, slice(16, None, None))] + query_rot_8 = None + neg_16 = -x2_16 + x2_16 = None + cat_33 = torch.cat((neg_16, x1_16), dim=-1) + neg_16 = x1_16 = None + mul_68 = cat_33 * sin_11 + cat_33 = None + q_embed_8 = mul_67 + mul_68 + mul_67 = mul_68 = None + mul_69 = key_rot_8 * cos_11 + cos_11 = 
None + x1_17 = key_rot_8[(Ellipsis, slice(None, 16, None))] + x2_17 = key_rot_8[(Ellipsis, slice(16, None, None))] + key_rot_8 = None + neg_17 = -x2_17 + x2_17 = None + cat_34 = torch.cat((neg_17, x1_17), dim=-1) + neg_17 = x1_17 = None + mul_70 = cat_34 * sin_11 + cat_34 = sin_11 = None + k_embed_8 = mul_69 + mul_70 + mul_69 = mul_70 = None + query_states_17 = torch.cat((q_embed_8, query_pass_8), dim=-1) + q_embed_8 = query_pass_8 = None + key_states_17 = torch.cat((k_embed_8, key_pass_8), dim=-1) + k_embed_8 = key_pass_8 = None + attention_mask_9 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_8 = query_states_17.contiguous() + query_states_17 = None + key_8 = key_states_17.contiguous() + value_8 = value_states_8.contiguous() + attn_output_32 = torch._C._nn.scaled_dot_product_attention( + query_8, + key_8, + value_8, + attn_mask=attention_mask_9, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_8 = key_8 = value_8 = attention_mask_9 = None + transpose_36 = attn_output_32.transpose(1, 2) + attn_output_32 = None + attn_output_33 = transpose_36.contiguous() + transpose_36 = None + reshape_8 = attn_output_33.reshape(1, 2, -1) + attn_output_33 = None + attn_output_34 = reshape_8.contiguous() + reshape_8 = None + attn_output_35 = torch._C._nn.linear( + attn_output_34, + l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_34 = l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_8 = torch.nn.functional.dropout(attn_output_35, 0.0, False, False) + attn_output_35 = None + hidden_states_41 = torch._C._nn.linear( + hidden_states_40, + l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_40 = ( + l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_71 = 0.5 * hidden_states_41 + pow_9 = torch.pow(hidden_states_41, 3.0) + mul_72 = 0.044715 * pow_9 + pow_9 = None + add_50 = hidden_states_41 + mul_72 + hidden_states_41 = mul_72 = None + mul_73 = 0.7978845608028654 * add_50 + add_50 = None + tanh_8 = torch.tanh(mul_73) + mul_73 = None + add_51 = 1.0 + tanh_8 + tanh_8 = None + hidden_states_42 = mul_71 * add_51 + mul_71 = add_51 = None + hidden_states_43 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_42 = ( + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_8 = torch.nn.functional.dropout( + hidden_states_43, 0.0, False, False + ) + hidden_states_43 = None + add_52 = attn_outputs_8 + feed_forward_hidden_states_8 + attn_outputs_8 = feed_forward_hidden_states_8 = None + hidden_states_44 = add_52 + hidden_states_39 + add_52 = hidden_states_39 = None + hidden_states_45 = torch.nn.functional.layer_norm( + hidden_states_44, + (2048,), + 
l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ + ) = None + linear_54 = torch._C._nn.linear( + hidden_states_45, + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_28 = linear_54.view((1, 2, -1, 64)) + linear_54 = None + query_states_18 = view_28.transpose(1, 2) + view_28 = None + linear_55 = torch._C._nn.linear( + hidden_states_45, + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_29 = linear_55.view((1, 2, -1, 64)) + linear_55 = None + key_states_18 = view_29.transpose(1, 2) + view_29 = None + linear_56 = torch._C._nn.linear( + hidden_states_45, + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_30 = linear_56.view((1, 2, -1, 64)) + linear_56 = None + value_states_9 = view_30.transpose(1, 2) + view_30 = None + query_rot_9 = query_states_18[(Ellipsis, slice(None, 32, None))] + query_pass_9 = query_states_18[(Ellipsis, slice(32, None, None))] + query_states_18 = None + key_rot_9 = key_states_18[(Ellipsis, slice(None, 32, None))] + key_pass_9 = key_states_18[(Ellipsis, slice(32, None, None))] + key_states_18 = None + cos_12 = cos_2.unsqueeze(1) + sin_12 = sin_2.unsqueeze(1) + mul_75 = query_rot_9 * cos_12 + x1_18 = query_rot_9[(Ellipsis, slice(None, 16, None))] + x2_18 = query_rot_9[(Ellipsis, slice(16, None, None))] + query_rot_9 = None + neg_18 = -x2_18 + x2_18 = None + cat_37 = torch.cat((neg_18, x1_18), dim=-1) + neg_18 = x1_18 = None + mul_76 = cat_37 * sin_12 + cat_37 = None + q_embed_9 = mul_75 + mul_76 + mul_75 = mul_76 = None + mul_77 = key_rot_9 * cos_12 + cos_12 = None + x1_19 = key_rot_9[(Ellipsis, slice(None, 16, None))] + x2_19 = key_rot_9[(Ellipsis, slice(16, None, None))] + key_rot_9 = None + neg_19 = -x2_19 + x2_19 = None + cat_38 = torch.cat((neg_19, x1_19), dim=-1) + neg_19 = x1_19 = None + mul_78 = cat_38 * sin_12 + cat_38 = sin_12 = None + k_embed_9 = mul_77 + mul_78 + mul_77 = mul_78 = None + query_states_19 = torch.cat((q_embed_9, query_pass_9), dim=-1) + q_embed_9 = query_pass_9 = None + key_states_19 = torch.cat((k_embed_9, key_pass_9), dim=-1) + k_embed_9 = key_pass_9 = None + attention_mask_10 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_9 = query_states_19.contiguous() + query_states_19 = None + key_9 = key_states_19.contiguous() + value_9 = value_states_9.contiguous() + 
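# torch._C._nn.scaled_dot_product_attention is the C++ binding behind
+ # torch.nn.functional.scaled_dot_product_attention; a readable equivalent of
+ # the call below would be (sketch):
+ #     F.scaled_dot_product_attention(query_9, key_9, value_9,
+ #         attn_mask=attention_mask_10, dropout_p=0.0, scale=0.125)
+ # scale=0.125 is 1/sqrt(head_dim) for head_dim=64, and is_causal=False
+ # because causality is already encoded in the explicit mask sliced from
+ # causal_mask_2 above.
+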
attn_output_36 = torch._C._nn.scaled_dot_product_attention( + query_9, + key_9, + value_9, + attn_mask=attention_mask_10, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_9 = key_9 = value_9 = attention_mask_10 = None + transpose_40 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_37 = transpose_40.contiguous() + transpose_40 = None + reshape_9 = attn_output_37.reshape(1, 2, -1) + attn_output_37 = None + attn_output_38 = reshape_9.contiguous() + reshape_9 = None + attn_output_39 = torch._C._nn.linear( + attn_output_38, + l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_38 = l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_9 = torch.nn.functional.dropout(attn_output_39, 0.0, False, False) + attn_output_39 = None + hidden_states_46 = torch._C._nn.linear( + hidden_states_45, + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_45 = ( + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_79 = 0.5 * hidden_states_46 + pow_10 = torch.pow(hidden_states_46, 3.0) + mul_80 = 0.044715 * pow_10 + pow_10 = None + add_56 = hidden_states_46 + mul_80 + hidden_states_46 = mul_80 = None + mul_81 = 0.7978845608028654 * add_56 + add_56 = None + tanh_9 = torch.tanh(mul_81) + mul_81 = None + add_57 = 1.0 + tanh_9 + tanh_9 = None + hidden_states_47 = mul_79 * add_57 + mul_79 = add_57 = None + hidden_states_48 = torch._C._nn.linear( + hidden_states_47, + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_47 = ( + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_9 = torch.nn.functional.dropout( + hidden_states_48, 0.0, False, False + ) + hidden_states_48 = None + add_58 = attn_outputs_9 + feed_forward_hidden_states_9 + attn_outputs_9 = feed_forward_hidden_states_9 = None + hidden_states_49 = add_58 + hidden_states_44 + add_58 = hidden_states_44 = None + hidden_states_50 = torch.nn.functional.layer_norm( + hidden_states_49, + (2048,), + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ + ) = None + linear_60 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_31 = linear_60.view((1, 2, -1, 64)) + linear_60 = None + query_states_20 = view_31.transpose(1, 2) + view_31 = None + 
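# Each projection output is reshaped with .view((1, 2, -1, 64)) and then
+ # .transpose(1, 2): (batch=1, seq=2, hidden) -> (1, heads, 2, head_dim=64).
+ # Given the 2048-wide hidden state seen at the layer norms, the -1 resolves
+ # to 32 heads. Note that the sample batch size and sequence length are baked
+ # into the captured graph as constants.
+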
linear_61 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_32 = linear_61.view((1, 2, -1, 64)) + linear_61 = None + key_states_20 = view_32.transpose(1, 2) + view_32 = None + linear_62 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_33 = linear_62.view((1, 2, -1, 64)) + linear_62 = None + value_states_10 = view_33.transpose(1, 2) + view_33 = None + query_rot_10 = query_states_20[(Ellipsis, slice(None, 32, None))] + query_pass_10 = query_states_20[(Ellipsis, slice(32, None, None))] + query_states_20 = None + key_rot_10 = key_states_20[(Ellipsis, slice(None, 32, None))] + key_pass_10 = key_states_20[(Ellipsis, slice(32, None, None))] + key_states_20 = None + cos_13 = cos_2.unsqueeze(1) + sin_13 = sin_2.unsqueeze(1) + mul_83 = query_rot_10 * cos_13 + x1_20 = query_rot_10[(Ellipsis, slice(None, 16, None))] + x2_20 = query_rot_10[(Ellipsis, slice(16, None, None))] + query_rot_10 = None + neg_20 = -x2_20 + x2_20 = None + cat_41 = torch.cat((neg_20, x1_20), dim=-1) + neg_20 = x1_20 = None + mul_84 = cat_41 * sin_13 + cat_41 = None + q_embed_10 = mul_83 + mul_84 + mul_83 = mul_84 = None + mul_85 = key_rot_10 * cos_13 + cos_13 = None + x1_21 = key_rot_10[(Ellipsis, slice(None, 16, None))] + x2_21 = key_rot_10[(Ellipsis, slice(16, None, None))] + key_rot_10 = None + neg_21 = -x2_21 + x2_21 = None + cat_42 = torch.cat((neg_21, x1_21), dim=-1) + neg_21 = x1_21 = None + mul_86 = cat_42 * sin_13 + cat_42 = sin_13 = None + k_embed_10 = mul_85 + mul_86 + mul_85 = mul_86 = None + query_states_21 = torch.cat((q_embed_10, query_pass_10), dim=-1) + q_embed_10 = query_pass_10 = None + key_states_21 = torch.cat((k_embed_10, key_pass_10), dim=-1) + k_embed_10 = key_pass_10 = None + attention_mask_11 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_10 = query_states_21.contiguous() + query_states_21 = None + key_10 = key_states_21.contiguous() + value_10 = value_states_10.contiguous() + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + query_10, + key_10, + value_10, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_10 = key_10 = value_10 = attention_mask_11 = None + transpose_44 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_44.contiguous() + transpose_44 = None + reshape_10 = attn_output_41.reshape(1, 2, -1) + attn_output_41 = None + attn_output_42 = reshape_10.contiguous() + reshape_10 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_42 = 
l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_10 = torch.nn.functional.dropout(attn_output_43, 0.0, False, False) + attn_output_43 = None + hidden_states_51 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_50 = ( + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_87 = 0.5 * hidden_states_51 + pow_11 = torch.pow(hidden_states_51, 3.0) + mul_88 = 0.044715 * pow_11 + pow_11 = None + add_62 = hidden_states_51 + mul_88 + hidden_states_51 = mul_88 = None + mul_89 = 0.7978845608028654 * add_62 + add_62 = None + tanh_10 = torch.tanh(mul_89) + mul_89 = None + add_63 = 1.0 + tanh_10 + tanh_10 = None + hidden_states_52 = mul_87 * add_63 + mul_87 = add_63 = None + hidden_states_53 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_52 = ( + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_10 = torch.nn.functional.dropout( + hidden_states_53, 0.0, False, False + ) + hidden_states_53 = None + add_64 = attn_outputs_10 + feed_forward_hidden_states_10 + attn_outputs_10 = feed_forward_hidden_states_10 = None + hidden_states_54 = add_64 + hidden_states_49 + add_64 = hidden_states_49 = None + hidden_states_55 = torch.nn.functional.layer_norm( + hidden_states_54, + (2048,), + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ + ) = None + linear_66 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_34 = linear_66.view((1, 2, -1, 64)) + linear_66 = None + query_states_22 = view_34.transpose(1, 2) + view_34 = None + linear_67 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_35 = linear_67.view((1, 2, -1, 64)) + linear_67 = None + key_states_22 = view_35.transpose(1, 2) + view_35 = None + linear_68 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_, + 
l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_36 = linear_68.view((1, 2, -1, 64)) + linear_68 = None + value_states_11 = view_36.transpose(1, 2) + view_36 = None + query_rot_11 = query_states_22[(Ellipsis, slice(None, 32, None))] + query_pass_11 = query_states_22[(Ellipsis, slice(32, None, None))] + query_states_22 = None + key_rot_11 = key_states_22[(Ellipsis, slice(None, 32, None))] + key_pass_11 = key_states_22[(Ellipsis, slice(32, None, None))] + key_states_22 = None + cos_14 = cos_2.unsqueeze(1) + sin_14 = sin_2.unsqueeze(1) + mul_91 = query_rot_11 * cos_14 + x1_22 = query_rot_11[(Ellipsis, slice(None, 16, None))] + x2_22 = query_rot_11[(Ellipsis, slice(16, None, None))] + query_rot_11 = None + neg_22 = -x2_22 + x2_22 = None + cat_45 = torch.cat((neg_22, x1_22), dim=-1) + neg_22 = x1_22 = None + mul_92 = cat_45 * sin_14 + cat_45 = None + q_embed_11 = mul_91 + mul_92 + mul_91 = mul_92 = None + mul_93 = key_rot_11 * cos_14 + cos_14 = None + x1_23 = key_rot_11[(Ellipsis, slice(None, 16, None))] + x2_23 = key_rot_11[(Ellipsis, slice(16, None, None))] + key_rot_11 = None + neg_23 = -x2_23 + x2_23 = None + cat_46 = torch.cat((neg_23, x1_23), dim=-1) + neg_23 = x1_23 = None + mul_94 = cat_46 * sin_14 + cat_46 = sin_14 = None + k_embed_11 = mul_93 + mul_94 + mul_93 = mul_94 = None + query_states_23 = torch.cat((q_embed_11, query_pass_11), dim=-1) + q_embed_11 = query_pass_11 = None + key_states_23 = torch.cat((k_embed_11, key_pass_11), dim=-1) + k_embed_11 = key_pass_11 = None + attention_mask_12 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_11 = query_states_23.contiguous() + query_states_23 = None + key_11 = key_states_23.contiguous() + value_11 = value_states_11.contiguous() + attn_output_44 = torch._C._nn.scaled_dot_product_attention( + query_11, + key_11, + value_11, + attn_mask=attention_mask_12, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_11 = key_11 = value_11 = attention_mask_12 = None + transpose_48 = attn_output_44.transpose(1, 2) + attn_output_44 = None + attn_output_45 = transpose_48.contiguous() + transpose_48 = None + reshape_11 = attn_output_45.reshape(1, 2, -1) + attn_output_45 = None + attn_output_46 = reshape_11.contiguous() + reshape_11 = None + attn_output_47 = torch._C._nn.linear( + attn_output_46, + l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_46 = l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_11 = torch.nn.functional.dropout(attn_output_47, 0.0, False, False) + attn_output_47 = None + hidden_states_56 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_55 = ( + l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_95 = 0.5 * hidden_states_56 + pow_12 = 
+        pow_12 = torch.pow(hidden_states_56, 3.0)
+        mul_96 = 0.044715 * pow_12
+        pow_12 = None
+        add_68 = hidden_states_56 + mul_96
+        hidden_states_56 = mul_96 = None
+        mul_97 = 0.7978845608028654 * add_68
+        add_68 = None
+        tanh_11 = torch.tanh(mul_97)
+        mul_97 = None
+        add_69 = 1.0 + tanh_11
+        tanh_11 = None
+        hidden_states_57 = mul_95 * add_69
+        mul_95 = add_69 = None
+        hidden_states_58 = torch._C._nn.linear(
+            hidden_states_57,
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_57 = (
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_11 = torch.nn.functional.dropout(
+            hidden_states_58, 0.0, False, False
+        )
+        hidden_states_58 = None
+        add_70 = attn_outputs_11 + feed_forward_hidden_states_11
+        attn_outputs_11 = feed_forward_hidden_states_11 = None
+        hidden_states_59 = add_70 + hidden_states_54
+        add_70 = hidden_states_54 = None
+        hidden_states_60 = torch.nn.functional.layer_norm(
+            hidden_states_59,
+            (2048,),
+            l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_72 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_37 = linear_72.view((1, 2, -1, 64))
+        linear_72 = None
+        query_states_24 = view_37.transpose(1, 2)
+        view_37 = None
+        linear_73 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_38 = linear_73.view((1, 2, -1, 64))
+        linear_73 = None
+        key_states_24 = view_38.transpose(1, 2)
+        view_38 = None
+        linear_74 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_39 = linear_74.view((1, 2, -1, 64))
+        linear_74 = None
+        value_states_12 = view_39.transpose(1, 2)
+        view_39 = None
+        query_rot_12 = query_states_24[(Ellipsis, slice(None, 32, None))]
+        query_pass_12 = query_states_24[(Ellipsis, slice(32, None, None))]
+        query_states_24 = None
+        key_rot_12 = key_states_24[(Ellipsis, slice(None, 32, None))]
+        key_pass_12 = key_states_24[(Ellipsis, slice(32, None, None))]
+        key_states_24 = None
+        cos_15 = cos_2.unsqueeze(1)
+        sin_15 = sin_2.unsqueeze(1)
+        mul_99 = query_rot_12 * cos_15
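+        # rotate_half: split the rotary half into (x1, x2) and build (-x2, x1), so that
+        # q_embed = q_rot * cos + rotate_half(q_rot) * sin (and likewise for k below).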
+        x1_24 = query_rot_12[(Ellipsis, slice(None, 16, None))]
+        x2_24 = query_rot_12[(Ellipsis, slice(16, None, None))]
+        query_rot_12 = None
+        neg_24 = -x2_24
+        x2_24 = None
+        cat_49 = torch.cat((neg_24, x1_24), dim=-1)
+        neg_24 = x1_24 = None
+        mul_100 = cat_49 * sin_15
+        cat_49 = None
+        q_embed_12 = mul_99 + mul_100
+        mul_99 = mul_100 = None
+        mul_101 = key_rot_12 * cos_15
+        cos_15 = None
+        x1_25 = key_rot_12[(Ellipsis, slice(None, 16, None))]
+        x2_25 = key_rot_12[(Ellipsis, slice(16, None, None))]
+        key_rot_12 = None
+        neg_25 = -x2_25
+        x2_25 = None
+        cat_50 = torch.cat((neg_25, x1_25), dim=-1)
+        neg_25 = x1_25 = None
+        mul_102 = cat_50 * sin_15
+        cat_50 = sin_15 = None
+        k_embed_12 = mul_101 + mul_102
+        mul_101 = mul_102 = None
+        query_states_25 = torch.cat((q_embed_12, query_pass_12), dim=-1)
+        q_embed_12 = query_pass_12 = None
+        key_states_25 = torch.cat((k_embed_12, key_pass_12), dim=-1)
+        k_embed_12 = key_pass_12 = None
+        attention_mask_13 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_12 = query_states_25.contiguous()
+        query_states_25 = None
+        key_12 = key_states_25.contiguous()
+        value_12 = value_states_12.contiguous()
+        attn_output_48 = torch._C._nn.scaled_dot_product_attention(
+            query_12,
+            key_12,
+            value_12,
+            attn_mask=attention_mask_13,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_12 = key_12 = value_12 = attention_mask_13 = None
+        transpose_52 = attn_output_48.transpose(1, 2)
+        attn_output_48 = None
+        attn_output_49 = transpose_52.contiguous()
+        transpose_52 = None
+        reshape_12 = attn_output_49.reshape(1, 2, -1)
+        attn_output_49 = None
+        attn_output_50 = reshape_12.contiguous()
+        reshape_12 = None
+        attn_output_51 = torch._C._nn.linear(
+            attn_output_50,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_50 = l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_12 = torch.nn.functional.dropout(attn_output_51, 0.0, False, False)
+        attn_output_51 = None
+        hidden_states_61 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_60 = (
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_103 = 0.5 * hidden_states_61
+        pow_13 = torch.pow(hidden_states_61, 3.0)
+        mul_104 = 0.044715 * pow_13
+        pow_13 = None
+        add_74 = hidden_states_61 + mul_104
+        hidden_states_61 = mul_104 = None
+        mul_105 = 0.7978845608028654 * add_74
+        add_74 = None
+        tanh_12 = torch.tanh(mul_105)
+        mul_105 = None
+        add_75 = 1.0 + tanh_12
+        tanh_12 = None
+        hidden_states_62 = mul_103 * add_75
+        mul_103 = add_75 = None
+        hidden_states_63 = torch._C._nn.linear(
+            hidden_states_62,
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_62 = (
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
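+        # Parallel residual: the attention branch and the MLP branch both read the
+        # same layer-normed input; their outputs are summed and added to the stream.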
+        feed_forward_hidden_states_12 = torch.nn.functional.dropout(
+            hidden_states_63, 0.0, False, False
+        )
+        hidden_states_63 = None
+        add_76 = attn_outputs_12 + feed_forward_hidden_states_12
+        attn_outputs_12 = feed_forward_hidden_states_12 = None
+        hidden_states_64 = add_76 + hidden_states_59
+        add_76 = hidden_states_59 = None
+        hidden_states_65 = torch.nn.functional.layer_norm(
+            hidden_states_64,
+            (2048,),
+            l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_78 = torch._C._nn.linear(
+            hidden_states_65,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_40 = linear_78.view((1, 2, -1, 64))
+        linear_78 = None
+        query_states_26 = view_40.transpose(1, 2)
+        view_40 = None
+        linear_79 = torch._C._nn.linear(
+            hidden_states_65,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_41 = linear_79.view((1, 2, -1, 64))
+        linear_79 = None
+        key_states_26 = view_41.transpose(1, 2)
+        view_41 = None
+        linear_80 = torch._C._nn.linear(
+            hidden_states_65,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_42 = linear_80.view((1, 2, -1, 64))
+        linear_80 = None
+        value_states_13 = view_42.transpose(1, 2)
+        view_42 = None
+        query_rot_13 = query_states_26[(Ellipsis, slice(None, 32, None))]
+        query_pass_13 = query_states_26[(Ellipsis, slice(32, None, None))]
+        query_states_26 = None
+        key_rot_13 = key_states_26[(Ellipsis, slice(None, 32, None))]
+        key_pass_13 = key_states_26[(Ellipsis, slice(32, None, None))]
+        key_states_26 = None
+        cos_16 = cos_2.unsqueeze(1)
+        sin_16 = sin_2.unsqueeze(1)
+        mul_107 = query_rot_13 * cos_16
+        x1_26 = query_rot_13[(Ellipsis, slice(None, 16, None))]
+        x2_26 = query_rot_13[(Ellipsis, slice(16, None, None))]
+        query_rot_13 = None
+        neg_26 = -x2_26
+        x2_26 = None
+        cat_53 = torch.cat((neg_26, x1_26), dim=-1)
+        neg_26 = x1_26 = None
+        mul_108 = cat_53 * sin_16
+        cat_53 = None
+        q_embed_13 = mul_107 + mul_108
+        mul_107 = mul_108 = None
+        mul_109 = key_rot_13 * cos_16
+        cos_16 = None
+        x1_27 = key_rot_13[(Ellipsis, slice(None, 16, None))]
+        x2_27 = key_rot_13[(Ellipsis, slice(16, None, None))]
+        key_rot_13 = None
+        neg_27 = -x2_27
+        x2_27 = None
+        cat_54 = torch.cat((neg_27, x1_27), dim=-1)
+        neg_27 = x1_27 = None
+        mul_110 = cat_54 * sin_16
+        cat_54 = sin_16 = None
+        k_embed_13 = mul_109 + mul_110
+        mul_109 = mul_110 = None
+        query_states_27 = torch.cat((q_embed_13, query_pass_13), dim=-1)
+        q_embed_13 = query_pass_13 = None
+        key_states_27 = torch.cat((k_embed_13, key_pass_13), dim=-1)
+        k_embed_13 = key_pass_13 = None
+        attention_mask_14 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_13 = query_states_27.contiguous()
+        query_states_27 = None
+        key_13 = key_states_27.contiguous()
+        value_13 = value_states_13.contiguous()
+        attn_output_52 = torch._C._nn.scaled_dot_product_attention(
+            query_13,
+            key_13,
+            value_13,
+            attn_mask=attention_mask_14,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_13 = key_13 = value_13 = attention_mask_14 = None
+        transpose_56 = attn_output_52.transpose(1, 2)
+        attn_output_52 = None
+        attn_output_53 = transpose_56.contiguous()
+        transpose_56 = None
+        reshape_13 = attn_output_53.reshape(1, 2, -1)
+        attn_output_53 = None
+        attn_output_54 = reshape_13.contiguous()
+        reshape_13 = None
+        attn_output_55 = torch._C._nn.linear(
+            attn_output_54,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_54 = l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_13 = torch.nn.functional.dropout(attn_output_55, 0.0, False, False)
+        attn_output_55 = None
+        hidden_states_66 = torch._C._nn.linear(
+            hidden_states_65,
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_65 = (
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_111 = 0.5 * hidden_states_66
+        pow_14 = torch.pow(hidden_states_66, 3.0)
+        mul_112 = 0.044715 * pow_14
+        pow_14 = None
+        add_80 = hidden_states_66 + mul_112
+        hidden_states_66 = mul_112 = None
+        mul_113 = 0.7978845608028654 * add_80
+        add_80 = None
+        tanh_13 = torch.tanh(mul_113)
+        mul_113 = None
+        add_81 = 1.0 + tanh_13
+        tanh_13 = None
+        hidden_states_67 = mul_111 * add_81
+        mul_111 = add_81 = None
+        hidden_states_68 = torch._C._nn.linear(
+            hidden_states_67,
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_67 = (
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_13 = torch.nn.functional.dropout(
+            hidden_states_68, 0.0, False, False
+        )
+        hidden_states_68 = None
+        add_82 = attn_outputs_13 + feed_forward_hidden_states_13
+        attn_outputs_13 = feed_forward_hidden_states_13 = None
+        hidden_states_69 = add_82 + hidden_states_64
+        add_82 = hidden_states_64 = None
+        hidden_states_70 = torch.nn.functional.layer_norm(
+            hidden_states_69,
+            (2048,),
+            l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_
+        ) = None
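+        # Q/K/V projections: each 2048-dim token is reshaped to (batch, seq, heads, 64)
+        # and transposed to (batch, heads, seq, 64), i.e. head_dim = 64.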
+        linear_84 = torch._C._nn.linear(
+            hidden_states_70,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_43 = linear_84.view((1, 2, -1, 64))
+        linear_84 = None
+        query_states_28 = view_43.transpose(1, 2)
+        view_43 = None
+        linear_85 = torch._C._nn.linear(
+            hidden_states_70,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_44 = linear_85.view((1, 2, -1, 64))
+        linear_85 = None
+        key_states_28 = view_44.transpose(1, 2)
+        view_44 = None
+        linear_86 = torch._C._nn.linear(
+            hidden_states_70,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_45 = linear_86.view((1, 2, -1, 64))
+        linear_86 = None
+        value_states_14 = view_45.transpose(1, 2)
+        view_45 = None
+        query_rot_14 = query_states_28[(Ellipsis, slice(None, 32, None))]
+        query_pass_14 = query_states_28[(Ellipsis, slice(32, None, None))]
+        query_states_28 = None
+        key_rot_14 = key_states_28[(Ellipsis, slice(None, 32, None))]
+        key_pass_14 = key_states_28[(Ellipsis, slice(32, None, None))]
+        key_states_28 = None
+        cos_17 = cos_2.unsqueeze(1)
+        sin_17 = sin_2.unsqueeze(1)
+        mul_115 = query_rot_14 * cos_17
+        x1_28 = query_rot_14[(Ellipsis, slice(None, 16, None))]
+        x2_28 = query_rot_14[(Ellipsis, slice(16, None, None))]
+        query_rot_14 = None
+        neg_28 = -x2_28
+        x2_28 = None
+        cat_57 = torch.cat((neg_28, x1_28), dim=-1)
+        neg_28 = x1_28 = None
+        mul_116 = cat_57 * sin_17
+        cat_57 = None
+        q_embed_14 = mul_115 + mul_116
+        mul_115 = mul_116 = None
+        mul_117 = key_rot_14 * cos_17
+        cos_17 = None
+        x1_29 = key_rot_14[(Ellipsis, slice(None, 16, None))]
+        x2_29 = key_rot_14[(Ellipsis, slice(16, None, None))]
+        key_rot_14 = None
+        neg_29 = -x2_29
+        x2_29 = None
+        cat_58 = torch.cat((neg_29, x1_29), dim=-1)
+        neg_29 = x1_29 = None
+        mul_118 = cat_58 * sin_17
+        cat_58 = sin_17 = None
+        k_embed_14 = mul_117 + mul_118
+        mul_117 = mul_118 = None
+        query_states_29 = torch.cat((q_embed_14, query_pass_14), dim=-1)
+        q_embed_14 = query_pass_14 = None
+        key_states_29 = torch.cat((k_embed_14, key_pass_14), dim=-1)
+        k_embed_14 = key_pass_14 = None
+        attention_mask_15 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_14 = query_states_29.contiguous()
+        query_states_29 = None
+        key_14 = key_states_29.contiguous()
+        value_14 = value_states_14.contiguous()
+        attn_output_56 = torch._C._nn.scaled_dot_product_attention(
+            query_14,
+            key_14,
+            value_14,
+            attn_mask=attention_mask_15,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_14 = key_14 = value_14 = attention_mask_15 = None
+        transpose_60 = attn_output_56.transpose(1, 2)
+        attn_output_56 = None
+        attn_output_57 = transpose_60.contiguous()
+        transpose_60 = None
+        reshape_14 = attn_output_57.reshape(1, 2, -1)
+        attn_output_57 = None
+        attn_output_58 = reshape_14.contiguous()
+        reshape_14 = None
+        attn_output_59 = torch._C._nn.linear(
+            attn_output_58,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_58 = l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_14 = torch.nn.functional.dropout(attn_output_59, 0.0, False, False)
+        attn_output_59 = None
+        hidden_states_71 = torch._C._nn.linear(
+            hidden_states_70,
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_70 = (
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_119 = 0.5 * hidden_states_71
+        pow_15 = torch.pow(hidden_states_71, 3.0)
+        mul_120 = 0.044715 * pow_15
+        pow_15 = None
+        add_86 = hidden_states_71 + mul_120
+        hidden_states_71 = mul_120 = None
+        mul_121 = 0.7978845608028654 * add_86
+        add_86 = None
+        tanh_14 = torch.tanh(mul_121)
+        mul_121 = None
+        add_87 = 1.0 + tanh_14
+        tanh_14 = None
+        hidden_states_72 = mul_119 * add_87
+        mul_119 = add_87 = None
+        hidden_states_73 = torch._C._nn.linear(
+            hidden_states_72,
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_72 = (
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_14 = torch.nn.functional.dropout(
+            hidden_states_73, 0.0, False, False
+        )
+        hidden_states_73 = None
+        add_88 = attn_outputs_14 + feed_forward_hidden_states_14
+        attn_outputs_14 = feed_forward_hidden_states_14 = None
+        hidden_states_74 = add_88 + hidden_states_69
+        add_88 = hidden_states_69 = None
+        hidden_states_75 = torch.nn.functional.layer_norm(
+            hidden_states_74,
+            (2048,),
+            l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_90 = torch._C._nn.linear(
+            hidden_states_75,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_46 = linear_90.view((1, 2, -1, 64))
+        linear_90 = None
+        query_states_30 = view_46.transpose(1, 2)
+        view_46 = None
+        linear_91 = torch._C._nn.linear(
+            hidden_states_75,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_47 = linear_91.view((1, 2, -1, 64))
+        linear_91 = None
+        key_states_30 = view_47.transpose(1, 2)
+        view_47 = None
+        linear_92 = torch._C._nn.linear(
+            hidden_states_75,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_48 = linear_92.view((1, 2, -1, 64))
+        linear_92 = None
+        value_states_15 = view_48.transpose(1, 2)
+        view_48 = None
+        query_rot_15 = query_states_30[(Ellipsis, slice(None, 32, None))]
+        query_pass_15 = query_states_30[(Ellipsis, slice(32, None, None))]
+        query_states_30 = None
+        key_rot_15 = key_states_30[(Ellipsis, slice(None, 32, None))]
+        key_pass_15 = key_states_30[(Ellipsis, slice(32, None, None))]
+        key_states_30 = None
+        cos_18 = cos_2.unsqueeze(1)
+        sin_18 = sin_2.unsqueeze(1)
+        mul_123 = query_rot_15 * cos_18
+        x1_30 = query_rot_15[(Ellipsis, slice(None, 16, None))]
+        x2_30 = query_rot_15[(Ellipsis, slice(16, None, None))]
+        query_rot_15 = None
+        neg_30 = -x2_30
+        x2_30 = None
+        cat_61 = torch.cat((neg_30, x1_30), dim=-1)
+        neg_30 = x1_30 = None
+        mul_124 = cat_61 * sin_18
+        cat_61 = None
+        q_embed_15 = mul_123 + mul_124
+        mul_123 = mul_124 = None
+        mul_125 = key_rot_15 * cos_18
+        cos_18 = None
+        x1_31 = key_rot_15[(Ellipsis, slice(None, 16, None))]
+        x2_31 = key_rot_15[(Ellipsis, slice(16, None, None))]
+        key_rot_15 = None
+        neg_31 = -x2_31
+        x2_31 = None
+        cat_62 = torch.cat((neg_31, x1_31), dim=-1)
+        neg_31 = x1_31 = None
+        mul_126 = cat_62 * sin_18
+        cat_62 = sin_18 = None
+        k_embed_15 = mul_125 + mul_126
+        mul_125 = mul_126 = None
+        query_states_31 = torch.cat((q_embed_15, query_pass_15), dim=-1)
+        q_embed_15 = query_pass_15 = None
+        key_states_31 = torch.cat((k_embed_15, key_pass_15), dim=-1)
+        k_embed_15 = key_pass_15 = None
+        attention_mask_16 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_15 = query_states_31.contiguous()
+        query_states_31 = None
+        key_15 = key_states_31.contiguous()
+        value_15 = value_states_15.contiguous()
+        attn_output_60 = torch._C._nn.scaled_dot_product_attention(
+            query_15,
+            key_15,
+            value_15,
+            attn_mask=attention_mask_16,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_15 = key_15 = value_15 = attention_mask_16 = None
+        transpose_64 = attn_output_60.transpose(1, 2)
+        attn_output_60 = None
+        attn_output_61 = transpose_64.contiguous()
+        transpose_64 = None
+        reshape_15 = attn_output_61.reshape(1, 2, -1)
+        attn_output_61 = None
+        attn_output_62 = reshape_15.contiguous()
+        reshape_15 = None
+        attn_output_63 = torch._C._nn.linear(
+            attn_output_62,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_62 = l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_15 = torch.nn.functional.dropout(attn_output_63, 0.0, False, False)
+        attn_output_63 = None
+        hidden_states_76 = torch._C._nn.linear(
+            hidden_states_75,
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_75 = (
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_127 = 0.5 * hidden_states_76
+        pow_16 = torch.pow(hidden_states_76, 3.0)
+        mul_128 = 0.044715 * pow_16
+        pow_16 = None
+        add_92 = hidden_states_76 + mul_128
+        hidden_states_76 = mul_128 = None
+        mul_129 = 0.7978845608028654 * add_92
+        add_92 = None
+        tanh_15 = torch.tanh(mul_129)
+        mul_129 = None
+        add_93 = 1.0 + tanh_15
+        tanh_15 = None
+        hidden_states_77 = mul_127 * add_93
+        mul_127 = add_93 = None
+        hidden_states_78 = torch._C._nn.linear(
+            hidden_states_77,
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_77 = (
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_15 = torch.nn.functional.dropout(
+            hidden_states_78, 0.0, False, False
+        )
+        hidden_states_78 = None
+        add_94 = attn_outputs_15 + feed_forward_hidden_states_15
+        attn_outputs_15 = feed_forward_hidden_states_15 = None
+        hidden_states_79 = add_94 + hidden_states_74
+        add_94 = hidden_states_74 = None
+        hidden_states_80 = torch.nn.functional.layer_norm(
+            hidden_states_79,
+            (2048,),
+            l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_96 = torch._C._nn.linear(
+            hidden_states_80,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_49 = linear_96.view((1, 2, -1, 64))
+        linear_96 = None
+        query_states_32 = view_49.transpose(1, 2)
+        view_49 = None
+        linear_97 = torch._C._nn.linear(
+            hidden_states_80,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_50 = linear_97.view((1, 2, -1, 64))
+        linear_97 = None
+        key_states_32 = view_50.transpose(1, 2)
+        view_50 = None
+        linear_98 = torch._C._nn.linear(
+            hidden_states_80,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_51 = linear_98.view((1, 2, -1, 64))
+        linear_98 = None
+        value_states_16 = view_51.transpose(1, 2)
+        view_51 = None
+        query_rot_16 = query_states_32[(Ellipsis, slice(None, 32, None))]
+        query_pass_16 = query_states_32[(Ellipsis, slice(32, None, None))]
+        query_states_32 = None
+        key_rot_16 = key_states_32[(Ellipsis, slice(None, 32, None))]
+        key_pass_16 = key_states_32[(Ellipsis, slice(32, None, None))]
+        key_states_32 = None
+        cos_19 = cos_2.unsqueeze(1)
+        sin_19 = sin_2.unsqueeze(1)
+        mul_131 = query_rot_16 * cos_19
+        x1_32 = query_rot_16[(Ellipsis, slice(None, 16, None))]
+        x2_32 = query_rot_16[(Ellipsis, slice(16, None, None))]
+        query_rot_16 = None
+        neg_32 = -x2_32
+        x2_32 = None
+        cat_65 = torch.cat((neg_32, x1_32), dim=-1)
+        neg_32 = x1_32 = None
+        mul_132 = cat_65 * sin_19
+        cat_65 = None
+        q_embed_16 = mul_131 + mul_132
+        mul_131 = mul_132 = None
+        mul_133 = key_rot_16 * cos_19
+        cos_19 = None
+        x1_33 = key_rot_16[(Ellipsis, slice(None, 16, None))]
+        x2_33 = key_rot_16[(Ellipsis, slice(16, None, None))]
+        key_rot_16 = None
+        neg_33 = -x2_33
+        x2_33 = None
+        cat_66 = torch.cat((neg_33, x1_33), dim=-1)
+        neg_33 = x1_33 = None
+        mul_134 = cat_66 * sin_19
+        cat_66 = sin_19 = None
+        k_embed_16 = mul_133 + mul_134
+        mul_133 = mul_134 = None
+        query_states_33 = torch.cat((q_embed_16, query_pass_16), dim=-1)
+        q_embed_16 = query_pass_16 = None
+        key_states_33 = torch.cat((k_embed_16, key_pass_16), dim=-1)
+        k_embed_16 = key_pass_16 = None
+        attention_mask_17 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_16 = query_states_33.contiguous()
+        query_states_33 = None
+        key_16 = key_states_33.contiguous()
+        value_16 = value_states_16.contiguous()
+        attn_output_64 = torch._C._nn.scaled_dot_product_attention(
+            query_16,
+            key_16,
+            value_16,
+            attn_mask=attention_mask_17,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_16 = key_16 = value_16 = attention_mask_17 = None
+        transpose_68 = attn_output_64.transpose(1, 2)
+        attn_output_64 = None
+        attn_output_65 = transpose_68.contiguous()
+        transpose_68 = None
+        reshape_16 = attn_output_65.reshape(1, 2, -1)
+        attn_output_65 = None
+        attn_output_66 = reshape_16.contiguous()
+        reshape_16 = None
+        attn_output_67 = torch._C._nn.linear(
+            attn_output_66,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_66 = l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_16 = torch.nn.functional.dropout(attn_output_67, 0.0, False, False)
+        attn_output_67 = None
+        hidden_states_81 = torch._C._nn.linear(
+            hidden_states_80,
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_80 = (
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_135 = 0.5 * hidden_states_81
+        pow_17 = torch.pow(hidden_states_81, 3.0)
+        mul_136 = 0.044715 * pow_17
+        pow_17 = None
+        add_98 = hidden_states_81 + mul_136
+        hidden_states_81 = mul_136 = None
+        mul_137 = 0.7978845608028654 * add_98
+        add_98 = None
+        tanh_16 = torch.tanh(mul_137)
+        mul_137 = None
+        add_99 = 1.0 + tanh_16
+        tanh_16 = None
+        hidden_states_82 = mul_135 * add_99
+        mul_135 = add_99 = None
+        hidden_states_83 = torch._C._nn.linear(
+            hidden_states_82,
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_82 = (
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_16 = torch.nn.functional.dropout(
+            hidden_states_83, 0.0, False, False
+        )
+        hidden_states_83 = None
+        add_100 = attn_outputs_16 + feed_forward_hidden_states_16
+        attn_outputs_16 = feed_forward_hidden_states_16 = None
+        hidden_states_84 = add_100 + hidden_states_79
+        add_100 = hidden_states_79 = None
+        hidden_states_85 = torch.nn.functional.layer_norm(
+            hidden_states_84,
+            (2048,),
+            l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_102 = torch._C._nn.linear(
+            hidden_states_85,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_52 = linear_102.view((1, 2, -1, 64))
+        linear_102 = None
+        query_states_34 = view_52.transpose(1, 2)
+        view_52 = None
+        linear_103 = torch._C._nn.linear(
+            hidden_states_85,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_53 = linear_103.view((1, 2, -1, 64))
+        linear_103 = None
+        key_states_34 = view_53.transpose(1, 2)
+        view_53 = None
+        linear_104 = torch._C._nn.linear(
+            hidden_states_85,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_54 = linear_104.view((1, 2, -1, 64))
+        linear_104 = None
+        value_states_17 = view_54.transpose(1, 2)
+        view_54 = None
+        query_rot_17 = query_states_34[(Ellipsis, slice(None, 32, None))]
+        query_pass_17 = query_states_34[(Ellipsis, slice(32, None, None))]
+        query_states_34 = None
+        key_rot_17 = key_states_34[(Ellipsis, slice(None, 32, None))]
+        key_pass_17 = key_states_34[(Ellipsis, slice(32, None, None))]
+        key_states_34 = None
+        cos_20 = cos_2.unsqueeze(1)
+        sin_20 = sin_2.unsqueeze(1)
+        mul_139 = query_rot_17 * cos_20
+        x1_34 = query_rot_17[(Ellipsis, slice(None, 16, None))]
+        x2_34 = query_rot_17[(Ellipsis, slice(16, None, None))]
+        query_rot_17 = None
+        neg_34 = -x2_34
+        x2_34 = None
+        cat_69 = torch.cat((neg_34, x1_34), dim=-1)
+        neg_34 = x1_34 = None
+        mul_140 = cat_69 * sin_20
+        cat_69 = None
+        q_embed_17 = mul_139 + mul_140
+        mul_139 = mul_140 = None
+        mul_141 = key_rot_17 * cos_20
+        cos_20 = None
+        x1_35 = key_rot_17[(Ellipsis, slice(None, 16, None))]
+        x2_35 = key_rot_17[(Ellipsis, slice(16, None, None))]
+        key_rot_17 = None
+        neg_35 = -x2_35
+        x2_35 = None
+        cat_70 = torch.cat((neg_35, x1_35), dim=-1)
+        neg_35 = x1_35 = None
+        mul_142 = cat_70 * sin_20
+        cat_70 = sin_20 = None
+        k_embed_17 = mul_141 + mul_142
+        mul_141 = mul_142 = None
+        query_states_35 = torch.cat((q_embed_17, query_pass_17), dim=-1)
+        q_embed_17 = query_pass_17 = None
+        key_states_35 = torch.cat((k_embed_17, key_pass_17), dim=-1)
+        k_embed_17 = key_pass_17 = None
+        attention_mask_18 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_17 = query_states_35.contiguous()
+        query_states_35 = None
+        key_17 = key_states_35.contiguous()
+        value_17 = value_states_17.contiguous()
+        attn_output_68 = torch._C._nn.scaled_dot_product_attention(
+            query_17,
+            key_17,
+            value_17,
+            attn_mask=attention_mask_18,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_17 = key_17 = value_17 = attention_mask_18 = None
+        transpose_72 = attn_output_68.transpose(1, 2)
+        attn_output_68 = None
+        attn_output_69 = transpose_72.contiguous()
+        transpose_72 = None
+        reshape_17 = attn_output_69.reshape(1, 2, -1)
+        attn_output_69 = None
+        attn_output_70 = reshape_17.contiguous()
+        reshape_17 = None
+        attn_output_71 = torch._C._nn.linear(
+            attn_output_70,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_70 = l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_17 = torch.nn.functional.dropout(attn_output_71, 0.0, False, False)
+        attn_output_71 = None
+        hidden_states_86 = torch._C._nn.linear(
+            hidden_states_85,
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_85 = (
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_143 = 0.5 * hidden_states_86
+        pow_18 = torch.pow(hidden_states_86, 3.0)
+        mul_144 = 0.044715 * pow_18
+        pow_18 = None
+        add_104 = hidden_states_86 + mul_144
+        hidden_states_86 = mul_144 = None
+        mul_145 = 0.7978845608028654 * add_104
+        add_104 = None
+        tanh_17 = torch.tanh(mul_145)
+        mul_145 = None
+        add_105 = 1.0 + tanh_17
+        tanh_17 = None
+        hidden_states_87 = mul_143 * add_105
+        mul_143 = add_105 = None
+        hidden_states_88 = torch._C._nn.linear(
+            hidden_states_87,
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_87 = (
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_17 = torch.nn.functional.dropout(
+            hidden_states_88, 0.0, False, False
+        )
+        hidden_states_88 = None
+        add_106 = attn_outputs_17 + feed_forward_hidden_states_17
+        attn_outputs_17 = feed_forward_hidden_states_17 = None
+        hidden_states_89 = add_106 + hidden_states_84
+        add_106 = hidden_states_84 = None
+        hidden_states_90 = torch.nn.functional.layer_norm(
+            hidden_states_89,
+            (2048,),
+            l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_108 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_55 = linear_108.view((1, 2, -1, 64))
+        linear_108 = None
+        query_states_36 = view_55.transpose(1, 2)
+        view_55 = None
+        linear_109 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_56 = linear_109.view((1, 2, -1, 64))
+        linear_109 = None
+        key_states_36 = view_56.transpose(1, 2)
+        view_56 = None
+        linear_110 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_57 = linear_110.view((1, 2, -1, 64))
+        linear_110 = None
+        value_states_18 = view_57.transpose(1, 2)
+        view_57 = None
+        query_rot_18 = query_states_36[(Ellipsis, slice(None, 32, None))]
+        query_pass_18 = query_states_36[(Ellipsis, slice(32, None, None))]
+        query_states_36 = None
+        key_rot_18 = key_states_36[(Ellipsis, slice(None, 32, None))]
+        key_pass_18 = key_states_36[(Ellipsis, slice(32, None, None))]
+        key_states_36 = None
+        cos_21 = cos_2.unsqueeze(1)
+        sin_21 = sin_2.unsqueeze(1)
+        mul_147 = query_rot_18 * cos_21
+        x1_36 = query_rot_18[(Ellipsis, slice(None, 16, None))]
+        x2_36 = query_rot_18[(Ellipsis, slice(16, None, None))]
+        query_rot_18 = None
+        neg_36 = -x2_36
+        x2_36 = None
+        cat_73 = torch.cat((neg_36, x1_36), dim=-1)
+        neg_36 = x1_36 = None
+        mul_148 = cat_73 * sin_21
+        cat_73 = None
+        q_embed_18 = mul_147 + mul_148
+        mul_147 = mul_148 = None
+        mul_149 = key_rot_18 * cos_21
+        cos_21 = None
+        x1_37 = key_rot_18[(Ellipsis, slice(None, 16, None))]
+        x2_37 = key_rot_18[(Ellipsis, slice(16, None, None))]
+        key_rot_18 = None
+        neg_37 = -x2_37
+        x2_37 = None
+        cat_74 = torch.cat((neg_37, x1_37), dim=-1)
+        neg_37 = x1_37 = None
+        mul_150 = cat_74 * sin_21
+        cat_74 = sin_21 = None
+        k_embed_18 = mul_149 + mul_150
+        mul_149 = mul_150 = None
+        query_states_37 = torch.cat((q_embed_18, query_pass_18), dim=-1)
+        q_embed_18 = query_pass_18 = None
+        key_states_37 = torch.cat((k_embed_18, key_pass_18), dim=-1)
+        k_embed_18 = key_pass_18 = None
+        attention_mask_19 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
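+        # Scaled dot-product attention: scale=0.125 is 1/sqrt(head_dim=64);
+        # is_causal=False because causality comes from the precomputed causal_mask_2 slice.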
+        query_18 = query_states_37.contiguous()
+        query_states_37 = None
+        key_18 = key_states_37.contiguous()
+        value_18 = value_states_18.contiguous()
+        attn_output_72 = torch._C._nn.scaled_dot_product_attention(
+            query_18,
+            key_18,
+            value_18,
+            attn_mask=attention_mask_19,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_18 = key_18 = value_18 = attention_mask_19 = None
+        transpose_76 = attn_output_72.transpose(1, 2)
+        attn_output_72 = None
+        attn_output_73 = transpose_76.contiguous()
+        transpose_76 = None
+        reshape_18 = attn_output_73.reshape(1, 2, -1)
+        attn_output_73 = None
+        attn_output_74 = reshape_18.contiguous()
+        reshape_18 = None
+        attn_output_75 = torch._C._nn.linear(
+            attn_output_74,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_74 = l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_18 = torch.nn.functional.dropout(attn_output_75, 0.0, False, False)
+        attn_output_75 = None
+        hidden_states_91 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_90 = (
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_151 = 0.5 * hidden_states_91
+        pow_19 = torch.pow(hidden_states_91, 3.0)
+        mul_152 = 0.044715 * pow_19
+        pow_19 = None
+        add_110 = hidden_states_91 + mul_152
+        hidden_states_91 = mul_152 = None
+        mul_153 = 0.7978845608028654 * add_110
+        add_110 = None
+        tanh_18 = torch.tanh(mul_153)
+        mul_153 = None
+        add_111 = 1.0 + tanh_18
+        tanh_18 = None
+        hidden_states_92 = mul_151 * add_111
+        mul_151 = add_111 = None
+        hidden_states_93 = torch._C._nn.linear(
+            hidden_states_92,
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_92 = (
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_18 = torch.nn.functional.dropout(
+            hidden_states_93, 0.0, False, False
+        )
+        hidden_states_93 = None
+        add_112 = attn_outputs_18 + feed_forward_hidden_states_18
+        attn_outputs_18 = feed_forward_hidden_states_18 = None
+        hidden_states_94 = add_112 + hidden_states_89
+        add_112 = hidden_states_89 = None
+        hidden_states_95 = torch.nn.functional.layer_norm(
+            hidden_states_94,
+            (2048,),
+            l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_114 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_58 = linear_114.view((1, 2, -1, 64))
+        linear_114 = None
+        query_states_38 = view_58.transpose(1, 2)
+        view_58 = None
+        linear_115 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_59 = linear_115.view((1, 2, -1, 64))
+        linear_115 = None
+        key_states_38 = view_59.transpose(1, 2)
+        view_59 = None
+        linear_116 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_60 = linear_116.view((1, 2, -1, 64))
+        linear_116 = None
+        value_states_19 = view_60.transpose(1, 2)
+        view_60 = None
+        query_rot_19 = query_states_38[(Ellipsis, slice(None, 32, None))]
+        query_pass_19 = query_states_38[(Ellipsis, slice(32, None, None))]
+        query_states_38 = None
+        key_rot_19 = key_states_38[(Ellipsis, slice(None, 32, None))]
+        key_pass_19 = key_states_38[(Ellipsis, slice(32, None, None))]
+        key_states_38 = None
+        cos_22 = cos_2.unsqueeze(1)
+        sin_22 = sin_2.unsqueeze(1)
+        mul_155 = query_rot_19 * cos_22
+        x1_38 = query_rot_19[(Ellipsis, slice(None, 16, None))]
+        x2_38 = query_rot_19[(Ellipsis, slice(16, None, None))]
+        query_rot_19 = None
+        neg_38 = -x2_38
+        x2_38 = None
+        cat_77 = torch.cat((neg_38, x1_38), dim=-1)
+        neg_38 = x1_38 = None
+        mul_156 = cat_77 * sin_22
+        cat_77 = None
+        q_embed_19 = mul_155 + mul_156
+        mul_155 = mul_156 = None
+        mul_157 = key_rot_19 * cos_22
+        cos_22 = None
+        x1_39 = key_rot_19[(Ellipsis, slice(None, 16, None))]
+        x2_39 = key_rot_19[(Ellipsis, slice(16, None, None))]
+        key_rot_19 = None
+        neg_39 = -x2_39
+        x2_39 = None
+        cat_78 = torch.cat((neg_39, x1_39), dim=-1)
+        neg_39 = x1_39 = None
+        mul_158 = cat_78 * sin_22
+        cat_78 = sin_22 = None
+        k_embed_19 = mul_157 + mul_158
+        mul_157 = mul_158 = None
+        query_states_39 = torch.cat((q_embed_19, query_pass_19), dim=-1)
+        q_embed_19 = query_pass_19 = None
+        key_states_39 = torch.cat((k_embed_19, key_pass_19), dim=-1)
+        k_embed_19 = key_pass_19 = None
+        attention_mask_20 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_19 = query_states_39.contiguous()
+        query_states_39 = None
+        key_19 = key_states_39.contiguous()
+        value_19 = value_states_19.contiguous()
+        attn_output_76 = torch._C._nn.scaled_dot_product_attention(
+            query_19,
+            key_19,
+            value_19,
+            attn_mask=attention_mask_20,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_19 = key_19 = value_19 = attention_mask_20 = None
+        transpose_80 = attn_output_76.transpose(1, 2)
+        attn_output_76 = None
+        attn_output_77 = transpose_80.contiguous()
+        transpose_80 = None
+        reshape_19 = attn_output_77.reshape(1, 2, -1)
+        attn_output_77 = None
+        attn_output_78 = reshape_19.contiguous()
+        reshape_19 = None
+        attn_output_79 = torch._C._nn.linear(
+            attn_output_78,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_78 = l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_19 = torch.nn.functional.dropout(attn_output_79, 0.0, False, False)
+        attn_output_79 = None
+        hidden_states_96 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_95 = (
+            l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_159 = 0.5 * hidden_states_96
+        pow_20 = torch.pow(hidden_states_96, 3.0)
+        mul_160 = 0.044715 * pow_20
+        pow_20 = None
+        add_116 = hidden_states_96 + mul_160
+        hidden_states_96 = mul_160 = None
+        mul_161 = 0.7978845608028654 * add_116
+        add_116 = None
+        tanh_19 = torch.tanh(mul_161)
+        mul_161 = None
+        add_117 = 1.0 + tanh_19
+        tanh_19 = None
+        hidden_states_97 = mul_159 * add_117
+        mul_159 = add_117 = None
+        hidden_states_98 = torch._C._nn.linear(
+            hidden_states_97,
+            l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_97 = (
+            l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_19 = torch.nn.functional.dropout(
+            hidden_states_98, 0.0, False, False
+        )
+        hidden_states_98 = None
+        add_118 = attn_outputs_19 + feed_forward_hidden_states_19
+        attn_outputs_19 = feed_forward_hidden_states_19 = None
+        hidden_states_99 = add_118 + hidden_states_94
+        add_118 = hidden_states_94 = None
+        hidden_states_100 = torch.nn.functional.layer_norm(
+            hidden_states_99,
+            (2048,),
+            l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_120 = torch._C._nn.linear(
+            hidden_states_100,
+            l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_61 = linear_120.view((1, 2, -1, 64))
+        linear_120 = None
+        query_states_40 = view_61.transpose(1, 2)
+        view_61 = None
+        linear_121 = torch._C._nn.linear(
+            hidden_states_100,
+            l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_62 = linear_121.view((1, 2, -1, 64))
+        linear_121 = None
+        key_states_40 = view_62.transpose(1, 2)
+        view_62 = None
+        linear_122 = torch._C._nn.linear(
+            hidden_states_100,
+            l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_63 = linear_122.view((1, 2, -1, 64))
+        linear_122 = None
+        value_states_20 = view_63.transpose(1, 2)
+        view_63 = None
+        query_rot_20 = query_states_40[(Ellipsis, slice(None, 32, None))]
+        query_pass_20 = query_states_40[(Ellipsis, slice(32, None, None))]
+        query_states_40 = None
+        key_rot_20 = key_states_40[(Ellipsis, slice(None, 32, None))]
+        key_pass_20 = key_states_40[(Ellipsis, slice(32, None, None))]
+        key_states_40 = None
+        cos_23 = cos_2.unsqueeze(1)
+        sin_23 = sin_2.unsqueeze(1)
+        mul_163 = query_rot_20 * cos_23
+        x1_40 = query_rot_20[(Ellipsis, slice(None, 16, None))]
+        x2_40 = query_rot_20[(Ellipsis, slice(16, None, None))]
+        query_rot_20 = None
+        neg_40 = -x2_40
+        x2_40 = None
+        cat_81 = torch.cat((neg_40, x1_40), dim=-1)
+        neg_40 = x1_40 = None
+        mul_164 = cat_81 * sin_23
+        cat_81 = None
+        q_embed_20 = mul_163 + mul_164
+        mul_163 = mul_164 = None
+        mul_165 = key_rot_20 * cos_23
+        cos_23 = None
+        x1_41 = key_rot_20[(Ellipsis, slice(None, 16, None))]
+        x2_41 = key_rot_20[(Ellipsis, slice(16, None, None))]
+        key_rot_20 = None
+        neg_41 = -x2_41
+        x2_41 = None
+        cat_82 = torch.cat((neg_41, x1_41), dim=-1)
+        neg_41 = x1_41 = None
+        mul_166 = cat_82 * sin_23
+        cat_82 = sin_23 = None
+        k_embed_20 = mul_165 + mul_166
+        mul_165 = mul_166 = None
+        query_states_41 = torch.cat((q_embed_20, query_pass_20), dim=-1)
+        q_embed_20 = query_pass_20 = None
+        key_states_41 = torch.cat((k_embed_20, key_pass_20), dim=-1)
+        k_embed_20 = key_pass_20 = None
+        attention_mask_21 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_20 = query_states_41.contiguous()
+        query_states_41 = None
+        key_20 = key_states_41.contiguous()
+        value_20 = value_states_20.contiguous()
+        attn_output_80 = torch._C._nn.scaled_dot_product_attention(
+            query_20,
+            key_20,
+            value_20,
+            attn_mask=attention_mask_21,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query_20 = key_20 = value_20 = attention_mask_21 = None
+        transpose_84 = attn_output_80.transpose(1, 2)
+        attn_output_80 = None
+        attn_output_81 = transpose_84.contiguous()
+        transpose_84 = None
+        reshape_20 = attn_output_81.reshape(1, 2, -1)
+        attn_output_81 = None
+        attn_output_82 = reshape_20.contiguous()
+        reshape_20 = None
+        attn_output_83 = torch._C._nn.linear(
+            attn_output_82,
+            l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_,
+        )
+        attn_output_82 = l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_ = (None)
+        attn_outputs_20 = torch.nn.functional.dropout(attn_output_83, 0.0, False, False)
+        attn_output_83 = None
+        hidden_states_101 = torch._C._nn.linear(
+            hidden_states_100,
+            l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_,
+            l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_,
+        )
+        hidden_states_100 = (
+            l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_
+        ) = None
+        mul_167 = 0.5 * hidden_states_101
+        pow_21 = torch.pow(hidden_states_101, 3.0)
+        mul_168 = 0.044715 * pow_21
+        pow_21 = None
+        add_122 = hidden_states_101 + mul_168
+        hidden_states_101 = mul_168 = None
+        mul_169 = 0.7978845608028654 * add_122
+        add_122 = None
+        tanh_20 = torch.tanh(mul_169)
+        mul_169 = None
+        add_123 = 1.0 + tanh_20
+        tanh_20 = None
+        hidden_states_102 = mul_167 * add_123
+        mul_167 = add_123 = None
+        hidden_states_103 = torch._C._nn.linear(
+            hidden_states_102,
+            l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_,
+            l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_,
+        )
+        hidden_states_102 = (
+            l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_
+        ) = (
+            l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_
+        ) = None
+        feed_forward_hidden_states_20 = torch.nn.functional.dropout(
+            hidden_states_103, 0.0, False, False
+        )
+        hidden_states_103 = None
+        add_124 = attn_outputs_20 + feed_forward_hidden_states_20
+        attn_outputs_20 = feed_forward_hidden_states_20 = None
+        hidden_states_104 = add_124 + hidden_states_99
+        add_124 = hidden_states_99 = None
+        hidden_states_105 = torch.nn.functional.layer_norm(
+            hidden_states_104,
+            (2048,),
+            l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = (
+            l_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_
+        ) = None
+        linear_126 = torch._C._nn.linear(
+            hidden_states_105,
+            l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_64 = linear_126.view((1, 2, -1, 64))
+        linear_126 = None
+        query_states_42 = view_64.transpose(1, 2)
+        view_64 = None
+        linear_127 = torch._C._nn.linear(
+            hidden_states_105,
+            l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_65 = linear_127.view((1, 2, -1, 64))
+        linear_127 = None
+        key_states_42 = view_65.transpose(1, 2)
+        view_65 = None
+        linear_128 = torch._C._nn.linear(
+            hidden_states_105,
+            l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_66 = linear_128.view((1, 2, -1, 64))
+        linear_128 = None
+        value_states_21 = view_66.transpose(1, 2)
+        view_66 = None
+        query_rot_21 = query_states_42[(Ellipsis, slice(None, 32, None))]
+        query_pass_21 = query_states_42[(Ellipsis, slice(32, None, None))]
+        query_states_42 = None
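+        # Partial rotary embedding: only the first 32 of the 64 head dims are rotated;
+        # the remaining *_pass_* half is concatenated back unchanged.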
+ key_rot_21 = key_states_42[(Ellipsis, slice(None, 32, None))] + key_pass_21 = key_states_42[(Ellipsis, slice(32, None, None))] + key_states_42 = None + cos_24 = cos_2.unsqueeze(1) + sin_24 = sin_2.unsqueeze(1) + mul_171 = query_rot_21 * cos_24 + x1_42 = query_rot_21[(Ellipsis, slice(None, 16, None))] + x2_42 = query_rot_21[(Ellipsis, slice(16, None, None))] + query_rot_21 = None + neg_42 = -x2_42 + x2_42 = None + cat_85 = torch.cat((neg_42, x1_42), dim=-1) + neg_42 = x1_42 = None + mul_172 = cat_85 * sin_24 + cat_85 = None + q_embed_21 = mul_171 + mul_172 + mul_171 = mul_172 = None + mul_173 = key_rot_21 * cos_24 + cos_24 = None + x1_43 = key_rot_21[(Ellipsis, slice(None, 16, None))] + x2_43 = key_rot_21[(Ellipsis, slice(16, None, None))] + key_rot_21 = None + neg_43 = -x2_43 + x2_43 = None + cat_86 = torch.cat((neg_43, x1_43), dim=-1) + neg_43 = x1_43 = None + mul_174 = cat_86 * sin_24 + cat_86 = sin_24 = None + k_embed_21 = mul_173 + mul_174 + mul_173 = mul_174 = None + query_states_43 = torch.cat((q_embed_21, query_pass_21), dim=-1) + q_embed_21 = query_pass_21 = None + key_states_43 = torch.cat((k_embed_21, key_pass_21), dim=-1) + k_embed_21 = key_pass_21 = None + attention_mask_22 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_21 = query_states_43.contiguous() + query_states_43 = None + key_21 = key_states_43.contiguous() + value_21 = value_states_21.contiguous() + attn_output_84 = torch._C._nn.scaled_dot_product_attention( + query_21, + key_21, + value_21, + attn_mask=attention_mask_22, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_21 = key_21 = value_21 = attention_mask_22 = None + transpose_88 = attn_output_84.transpose(1, 2) + attn_output_84 = None + attn_output_85 = transpose_88.contiguous() + transpose_88 = None + reshape_21 = attn_output_85.reshape(1, 2, -1) + attn_output_85 = None + attn_output_86 = reshape_21.contiguous() + reshape_21 = None + attn_output_87 = torch._C._nn.linear( + attn_output_86, + l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_86 = l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_21 = torch.nn.functional.dropout(attn_output_87, 0.0, False, False) + attn_output_87 = None + hidden_states_106 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_105 = ( + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_175 = 0.5 * hidden_states_106 + pow_22 = torch.pow(hidden_states_106, 3.0) + mul_176 = 0.044715 * pow_22 + pow_22 = None + add_128 = hidden_states_106 + mul_176 + hidden_states_106 = mul_176 = None + mul_177 = 0.7978845608028654 * add_128 + add_128 = None + tanh_21 = torch.tanh(mul_177) + mul_177 = None + add_129 = 1.0 + tanh_21 + tanh_21 = None + hidden_states_107 = mul_175 * add_129 + mul_175 = add_129 = None + hidden_states_108 = torch._C._nn.linear( + hidden_states_107, + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_, + 
l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_107 = ( + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_21 = torch.nn.functional.dropout( + hidden_states_108, 0.0, False, False + ) + hidden_states_108 = None + add_130 = attn_outputs_21 + feed_forward_hidden_states_21 + attn_outputs_21 = feed_forward_hidden_states_21 = None + hidden_states_109 = add_130 + hidden_states_104 + add_130 = hidden_states_104 = None + hidden_states_110 = torch.nn.functional.layer_norm( + hidden_states_109, + (2048,), + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_ + ) = None + linear_132 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_67 = linear_132.view((1, 2, -1, 64)) + linear_132 = None + query_states_44 = view_67.transpose(1, 2) + view_67 = None + linear_133 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_68 = linear_133.view((1, 2, -1, 64)) + linear_133 = None + key_states_44 = view_68.transpose(1, 2) + view_68 = None + linear_134 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_69 = linear_134.view((1, 2, -1, 64)) + linear_134 = None + value_states_22 = view_69.transpose(1, 2) + view_69 = None + query_rot_22 = query_states_44[(Ellipsis, slice(None, 32, None))] + query_pass_22 = query_states_44[(Ellipsis, slice(32, None, None))] + query_states_44 = None + key_rot_22 = key_states_44[(Ellipsis, slice(None, 32, None))] + key_pass_22 = key_states_44[(Ellipsis, slice(32, None, None))] + key_states_44 = None + cos_25 = cos_2.unsqueeze(1) + sin_25 = sin_2.unsqueeze(1) + mul_179 = query_rot_22 * cos_25 + x1_44 = query_rot_22[(Ellipsis, slice(None, 16, None))] + x2_44 = query_rot_22[(Ellipsis, slice(16, None, None))] + query_rot_22 = None + neg_44 = -x2_44 + x2_44 = None + cat_89 = torch.cat((neg_44, x1_44), dim=-1) + neg_44 = x1_44 = None + mul_180 = cat_89 * sin_25 + cat_89 = None + q_embed_22 = mul_179 + mul_180 + mul_179 = mul_180 = None + mul_181 = key_rot_22 * cos_25 + cos_25 = None + x1_45 = key_rot_22[(Ellipsis, slice(None, 16, None))] + x2_45 = key_rot_22[(Ellipsis, 
slice(16, None, None))] + key_rot_22 = None + neg_45 = -x2_45 + x2_45 = None + cat_90 = torch.cat((neg_45, x1_45), dim=-1) + neg_45 = x1_45 = None + mul_182 = cat_90 * sin_25 + cat_90 = sin_25 = None + k_embed_22 = mul_181 + mul_182 + mul_181 = mul_182 = None + query_states_45 = torch.cat((q_embed_22, query_pass_22), dim=-1) + q_embed_22 = query_pass_22 = None + key_states_45 = torch.cat((k_embed_22, key_pass_22), dim=-1) + k_embed_22 = key_pass_22 = None + attention_mask_23 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_22 = query_states_45.contiguous() + query_states_45 = None + key_22 = key_states_45.contiguous() + value_22 = value_states_22.contiguous() + attn_output_88 = torch._C._nn.scaled_dot_product_attention( + query_22, + key_22, + value_22, + attn_mask=attention_mask_23, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_22 = key_22 = value_22 = attention_mask_23 = None + transpose_92 = attn_output_88.transpose(1, 2) + attn_output_88 = None + attn_output_89 = transpose_92.contiguous() + transpose_92 = None + reshape_22 = attn_output_89.reshape(1, 2, -1) + attn_output_89 = None + attn_output_90 = reshape_22.contiguous() + reshape_22 = None + attn_output_91 = torch._C._nn.linear( + attn_output_90, + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_90 = l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_22 = torch.nn.functional.dropout(attn_output_91, 0.0, False, False) + attn_output_91 = None + hidden_states_111 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_110 = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_183 = 0.5 * hidden_states_111 + pow_23 = torch.pow(hidden_states_111, 3.0) + mul_184 = 0.044715 * pow_23 + pow_23 = None + add_134 = hidden_states_111 + mul_184 + hidden_states_111 = mul_184 = None + mul_185 = 0.7978845608028654 * add_134 + add_134 = None + tanh_22 = torch.tanh(mul_185) + mul_185 = None + add_135 = 1.0 + tanh_22 + tanh_22 = None + hidden_states_112 = mul_183 * add_135 + mul_183 = add_135 = None + hidden_states_113 = torch._C._nn.linear( + hidden_states_112, + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_112 = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_22 = torch.nn.functional.dropout( + hidden_states_113, 0.0, False, False + ) + hidden_states_113 = None + add_136 = attn_outputs_22 + feed_forward_hidden_states_22 + attn_outputs_22 = feed_forward_hidden_states_22 = None + hidden_states_114 = add_136 + hidden_states_109 + add_136 = hidden_states_109 = None + hidden_states_115 = torch.nn.functional.layer_norm( + hidden_states_114, + (2048,), + 
l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_ + ) = None + linear_138 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_70 = linear_138.view((1, 2, -1, 64)) + linear_138 = None + query_states_46 = view_70.transpose(1, 2) + view_70 = None + linear_139 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_71 = linear_139.view((1, 2, -1, 64)) + linear_139 = None + key_states_46 = view_71.transpose(1, 2) + view_71 = None + linear_140 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_72 = linear_140.view((1, 2, -1, 64)) + linear_140 = None + value_states_23 = view_72.transpose(1, 2) + view_72 = None + query_rot_23 = query_states_46[(Ellipsis, slice(None, 32, None))] + query_pass_23 = query_states_46[(Ellipsis, slice(32, None, None))] + query_states_46 = None + key_rot_23 = key_states_46[(Ellipsis, slice(None, 32, None))] + key_pass_23 = key_states_46[(Ellipsis, slice(32, None, None))] + key_states_46 = None + cos_26 = cos_2.unsqueeze(1) + cos_2 = None + sin_26 = sin_2.unsqueeze(1) + sin_2 = None + mul_187 = query_rot_23 * cos_26 + x1_46 = query_rot_23[(Ellipsis, slice(None, 16, None))] + x2_46 = query_rot_23[(Ellipsis, slice(16, None, None))] + query_rot_23 = None + neg_46 = -x2_46 + x2_46 = None + cat_93 = torch.cat((neg_46, x1_46), dim=-1) + neg_46 = x1_46 = None + mul_188 = cat_93 * sin_26 + cat_93 = None + q_embed_23 = mul_187 + mul_188 + mul_187 = mul_188 = None + mul_189 = key_rot_23 * cos_26 + cos_26 = None + x1_47 = key_rot_23[(Ellipsis, slice(None, 16, None))] + x2_47 = key_rot_23[(Ellipsis, slice(16, None, None))] + key_rot_23 = None + neg_47 = -x2_47 + x2_47 = None + cat_94 = torch.cat((neg_47, x1_47), dim=-1) + neg_47 = x1_47 = None + mul_190 = cat_94 * sin_26 + cat_94 = sin_26 = None + k_embed_23 = mul_189 + mul_190 + mul_189 = mul_190 = None + query_states_47 = torch.cat((q_embed_23, query_pass_23), dim=-1) + q_embed_23 = query_pass_23 = None + key_states_47 = torch.cat((k_embed_23, key_pass_23), dim=-1) + k_embed_23 = key_pass_23 = None + attention_mask_24 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + causal_mask_2 = None + query_23 = query_states_47.contiguous() 
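[Reviewer note] Each layer above applies rotary position embeddings to only the first 32 of the 64 head channels (partial rotary): the x1_*/x2_* slices are the 16-dim halves of the rotated block, and q_embed = q_rot * cos + rotate_half(q_rot) * sin, with the untouched "pass" channels concatenated back on. A self-contained sketch of that pattern, assuming the shapes seen in the trace (function names are ours; the captured graph inlines all of this):

    import torch

    def rotate_half(x: torch.Tensor) -> torch.Tensor:
        # swap the two halves of the last dim with a sign flip: (x1, x2) -> (-x2, x1)
        half = x.shape[-1] // 2
        x1, x2 = x[..., :half], x[..., half:]
        return torch.cat((-x2, x1), dim=-1)

    def apply_partial_rope(q, k, cos, sin, rotary_dim: int = 32):
        # only the first `rotary_dim` channels of each 64-dim head are rotated;
        # the remaining channels pass through unchanged
        q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
        k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
        q_rot = q_rot * cos + rotate_half(q_rot) * sin
        k_rot = k_rot * cos + rotate_half(k_rot) * sin
        return (
            torch.cat((q_rot, q_pass), dim=-1),
            torch.cat((k_rot, k_pass), dim=-1),
        )

    # shapes as in the trace: (batch=1, heads, seq=2, head_dim=64),
    # cos/sin broadcast over heads after unsqueeze(1)
    q = torch.randn(1, 32, 2, 64)
    k = torch.randn(1, 32, 2, 64)
    cos = torch.randn(1, 2, 32).unsqueeze(1)
    sin = torch.randn(1, 2, 32).unsqueeze(1)
    q_embed, k_embed = apply_partial_rope(q, k, cos, sin)

The scaled_dot_product_attention calls that follow pass scale=0.125, i.e. 1/sqrt(64) for the 64-dim heads, and supply the causal mask explicitly (sliced from causal_mask_2) rather than setting is_causal=True.
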
+ query_states_47 = None + key_23 = key_states_47.contiguous() + value_23 = value_states_23.contiguous() + attn_output_92 = torch._C._nn.scaled_dot_product_attention( + query_23, + key_23, + value_23, + attn_mask=attention_mask_24, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_23 = key_23 = value_23 = attention_mask_24 = None + transpose_96 = attn_output_92.transpose(1, 2) + attn_output_92 = None + attn_output_93 = transpose_96.contiguous() + transpose_96 = None + reshape_23 = attn_output_93.reshape(1, 2, -1) + attn_output_93 = None + attn_output_94 = reshape_23.contiguous() + reshape_23 = None + attn_output_95 = torch._C._nn.linear( + attn_output_94, + l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_94 = l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_23 = torch.nn.functional.dropout(attn_output_95, 0.0, False, False) + attn_output_95 = None + hidden_states_116 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_115 = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_191 = 0.5 * hidden_states_116 + pow_24 = torch.pow(hidden_states_116, 3.0) + mul_192 = 0.044715 * pow_24 + pow_24 = None + add_140 = hidden_states_116 + mul_192 + hidden_states_116 = mul_192 = None + mul_193 = 0.7978845608028654 * add_140 + add_140 = None + tanh_23 = torch.tanh(mul_193) + mul_193 = None + add_141 = 1.0 + tanh_23 + tanh_23 = None + hidden_states_117 = mul_191 * add_141 + mul_191 = add_141 = None + hidden_states_118 = torch._C._nn.linear( + hidden_states_117, + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_117 = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_23 = torch.nn.functional.dropout( + hidden_states_118, 0.0, False, False + ) + hidden_states_118 = None + add_142 = attn_outputs_23 + feed_forward_hidden_states_23 + attn_outputs_23 = feed_forward_hidden_states_23 = None + hidden_states_119 = add_142 + hidden_states_114 + add_142 = hidden_states_114 = None + hidden_states_120 = torch.nn.functional.layer_norm( + hidden_states_119, + (2048,), + l_self_modules_final_layernorm_parameters_weight_, + l_self_modules_final_layernorm_parameters_bias_, + 1e-05, + ) + hidden_states_119 = ( + l_self_modules_final_layernorm_parameters_weight_ + ) = l_self_modules_final_layernorm_parameters_bias_ = None + return ( + value_states, + key_states_1, + value_states_1, + key_states_3, + value_states_2, + key_states_5, + value_states_3, + key_states_7, + value_states_4, + key_states_9, + value_states_5, + key_states_11, + value_states_6, + key_states_13, + value_states_7, + key_states_15, + value_states_8, + key_states_17, + value_states_9, + key_states_19, + value_states_10, + key_states_21, + value_states_11, + key_states_23, + value_states_12, + 
key_states_25, + value_states_13, + key_states_27, + value_states_14, + key_states_29, + value_states_15, + key_states_31, + value_states_16, + key_states_33, + value_states_17, + key_states_35, + value_states_18, + key_states_37, + value_states_19, + key_states_39, + value_states_20, + key_states_41, + value_states_21, + key_states_43, + value_states_22, + key_states_45, + value_states_23, + key_states_47, + hidden_states_120, + ) diff --git a/samples/transformers-auto-model/microsoft/phi-1_5/weight_meta.py b/samples/transformers-auto-model/microsoft/phi-1_5/weight_meta.py new file mode 100644 index 000000000..f2e66892d --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-1_5/weight_meta.py @@ -0,0 +1,3425 @@ +class Program_weight_tensor_meta_L_inputs_embeds_: + name = "L_inputs_embeds_" + shape = [1, 2, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1] + + +class Program_weight_tensor_meta_L_self_modules_rotary_emb_buffers_inv_freq_: + name = "L_self_modules_rotary_emb_buffers_inv_freq_" + shape = [16] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.143 + std = 0.275 + data = [ + 1.000000, + 0.562341, + 0.316228, + 0.177828, + 0.100000, + 0.056234, + 0.031623, + 0.017783, + 0.010000, + 0.005623, + 0.003162, + 0.001778, + 0.001000, + 0.000562, + 0.000316, + 0.000178, + ] + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class 
Program_weight_tensor_meta_L_self_modules_final_layernorm_parameters_weight_:
+    name = "L_self_modules_final_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_final_layernorm_parameters_bias_:
+    name = "L_self_modules_final_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
diff --git a/samples/transformers-auto-model/microsoft/phi-2/graph_hash.txt b/samples/transformers-auto-model/microsoft/phi-2/graph_hash.txt
new file mode 100644
index 000000000..a8af890cf
--- /dev/null
+++ b/samples/transformers-auto-model/microsoft/phi-2/graph_hash.txt
@@ -0,0 +1 @@
+86dc427794b9c5574b9ac3b5a78cbd2ce7d2d46e737f2a4b28eb8a1bbdace8b9
\ No newline at end of file
diff --git a/samples/transformers-auto-model/microsoft/phi-2/graph_net.json b/samples/transformers-auto-model/microsoft/phi-2/graph_net.json
new file mode 100644
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/microsoft/phi-2/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "torch",
+    "num_devices_required": 1,
+    "num_nodes_required": 1,
+    "dynamic": false
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/microsoft/phi-2/input_meta.py b/samples/transformers-auto-model/microsoft/phi-2/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/microsoft/phi-2/input_tensor_constraints.py b/samples/transformers-auto-model/microsoft/phi-2/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/microsoft/phi-2/model.py b/samples/transformers-auto-model/microsoft/phi-2/model.py
new file mode 100644
index 000000000..fc705d542
--- /dev/null
+++ b/samples/transformers-auto-model/microsoft/phi-2/model.py
@@ -0,0 +1,6500 @@
+import torch
+
+from torch import device
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_inputs_embeds_: torch.Tensor,
+        L_attention_mask_: torch.Tensor,
+        L_self_modules_rotary_emb_buffers_inv_freq_: torch.Tensor,
+        L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_:
torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter, + 
+        L_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_final_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_final_layernorm_parameters_bias_: torch.nn.parameter.Parameter,
+    ):
+        l_inputs_embeds_ = L_inputs_embeds_
+        l_attention_mask_ = L_attention_mask_
+        l_self_modules_rotary_emb_buffers_inv_freq_ = (
+            L_self_modules_rotary_emb_buffers_inv_freq_
+        )
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_24_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_24_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_25_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_25_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_26_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_26_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_27_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_27_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_weight_
+        l_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_bias_
+        l_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_weight_ = (
+            L_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_weight_
+        )
+        l_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_bias_ = (
+            L_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_bias_
+        )
+        l_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_weight_ = (
+            L_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_weight_
+        )
+        l_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_bias_ = (
+            L_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_bias_
+        )
+        l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = (
+            L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_
+        )
+        l_self_modules_layers_modules_28_modules_input_layernorm_parameters_bias_ = (
+            L_self_modules_layers_modules_28_modules_input_layernorm_parameters_bias_
+        )
+        l_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_bias_ =
L_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_29_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_weight_ = ( + 
L_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_30_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_31_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_bias_ = 
L_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_bias_ = L_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_bias_ + l_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_weight_ = ( + L_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_weight_ + ) + l_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_bias_ = ( + L_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_bias_ + ) + l_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_weight_ = ( + L_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_weight_ + ) + l_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_bias_ = ( + L_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_bias_ + ) + l_self_modules_final_layernorm_parameters_weight_ = ( + L_self_modules_final_layernorm_parameters_weight_ + ) + l_self_modules_final_layernorm_parameters_bias_ = ( + L_self_modules_final_layernorm_parameters_bias_ + ) + cache_position = torch.arange(0, 2, device=device(type="cuda", index=0)) + position_ids = cache_position.unsqueeze(0) + attention_mask = l_attention_mask_.to( + device=device(type="cuda", index=0), dtype=torch.bool + ) + l_attention_mask_ = None + mask_indices = torch.arange(2, device=device(type="cuda", index=0)) + mask_indices += 0 + mask_indices_1 = mask_indices + mask_indices = None + local_padding_mask = attention_mask[(slice(None, None, None), mask_indices_1)] + attention_mask = mask_indices_1 = None + kv_arange = torch.arange(2, device=device(type="cuda", index=0)) + kv_arange += 0 + kv_arange_1 = kv_arange + kv_arange = None + reshaped_cache_position = cache_position.view(-1, 1) + cache_position = None + causal_mask = kv_arange_1 <= reshaped_cache_position + kv_arange_1 = reshaped_cache_position = None + getitem_1 = causal_mask[ + (None, None, slice(None, None, None), slice(None, None, None)) + ] + causal_mask = None + causal_mask_1 = getitem_1.expand(1, -1, -1, -1) + getitem_1 = None + getitem_2 = local_padding_mask[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + local_padding_mask = None + causal_mask_2 = causal_mask_1 * getitem_2 + causal_mask_1 = getitem_2 = None + inputs_embeds = torch.nn.functional.dropout(l_inputs_embeds_, 0.0, False, False) + l_inputs_embeds_ = None + _set_grad_enabled = torch._C._set_grad_enabled(False) + _set_grad_enabled = None + getitem_3 = l_self_modules_rotary_emb_buffers_inv_freq_[ + (None, slice(None, None, None), None) + ] + l_self_modules_rotary_emb_buffers_inv_freq_ = None + float_1 = getitem_3.float() + getitem_3 = None + expand_1 = float_1.expand(1, -1, 1) + float_1 = None + inv_freq_expanded = expand_1.to(device(type="cuda", index=0)) + expand_1 = None + getitem_4 = position_ids[ + (slice(None, None, None), None, slice(None, None, None)) + ] + position_ids = None + position_ids_expanded = getitem_4.float() + 
getitem_4 = None + float_3 = inv_freq_expanded.float() + inv_freq_expanded = None + float_4 = position_ids_expanded.float() + position_ids_expanded = None + matmul = float_3 @ float_4 + float_3 = float_4 = None + freqs = matmul.transpose(1, 2) + matmul = None + emb = torch.cat((freqs, freqs), dim=-1) + freqs = None + cos = emb.cos() + cos_1 = cos * 1.0 + cos = None + sin = emb.sin() + emb = None + sin_1 = sin * 1.0 + sin = None + cos_2 = cos_1.to(dtype=torch.float16) + cos_1 = None + sin_2 = sin_1.to(dtype=torch.float16) + sin_1 = None + _set_grad_enabled_1 = torch._C._set_grad_enabled(True) + _set_grad_enabled_1 = None + _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module") + _log_api_usage_once = None + hidden_states = torch.nn.functional.layer_norm( + inputs_embeds, + (2560,), + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ + ) = None + linear = torch._C._nn.linear( + hidden_states, + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_1 = linear.view((1, 2, -1, 80)) + linear = None + query_states = view_1.transpose(1, 2) + view_1 = None + linear_1 = torch._C._nn.linear( + hidden_states, + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_2 = linear_1.view((1, 2, -1, 80)) + linear_1 = None + key_states = view_2.transpose(1, 2) + view_2 = None + linear_2 = torch._C._nn.linear( + hidden_states, + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_3 = linear_2.view((1, 2, -1, 80)) + linear_2 = None + value_states = view_3.transpose(1, 2) + view_3 = None + query_rot = query_states[(Ellipsis, slice(None, 32, None))] + query_pass = query_states[(Ellipsis, slice(32, None, None))] + query_states = None + key_rot = key_states[(Ellipsis, slice(None, 32, None))] + key_pass = key_states[(Ellipsis, slice(32, None, None))] + key_states = None + cos_3 = cos_2.unsqueeze(1) + sin_3 = sin_2.unsqueeze(1) + mul_3 = query_rot * cos_3 + x1 = query_rot[(Ellipsis, slice(None, 16, None))] + x2 = query_rot[(Ellipsis, slice(16, None, None))] + query_rot = None + neg = -x2 + x2 = None + cat_1 = torch.cat((neg, x1), dim=-1) + neg = x1 = None + mul_4 = cat_1 * sin_3 + cat_1 = None + q_embed = mul_3 + mul_4 + mul_3 = mul_4 = None + mul_5 = key_rot * cos_3 + cos_3 = None + x1_1 = key_rot[(Ellipsis, slice(None, 16, None))] + x2_1 = key_rot[(Ellipsis, slice(16, None, None))] + key_rot = None + neg_1 = -x2_1 + x2_1 = None + 
cat_2 = torch.cat((neg_1, x1_1), dim=-1) + neg_1 = x1_1 = None + mul_6 = cat_2 * sin_3 + cat_2 = sin_3 = None + k_embed = mul_5 + mul_6 + mul_5 = mul_6 = None + query_states_1 = torch.cat((q_embed, query_pass), dim=-1) + q_embed = query_pass = None + key_states_1 = torch.cat((k_embed, key_pass), dim=-1) + k_embed = key_pass = None + attention_mask_1 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query = query_states_1.contiguous() + query_states_1 = None + key = key_states_1.contiguous() + value = value_states.contiguous() + attn_output = torch._C._nn.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask_1, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query = key = value = attention_mask_1 = None + transpose_4 = attn_output.transpose(1, 2) + attn_output = None + attn_output_1 = transpose_4.contiguous() + transpose_4 = None + reshape = attn_output_1.reshape(1, 2, -1) + attn_output_1 = None + attn_output_2 = reshape.contiguous() + reshape = None + attn_output_3 = torch._C._nn.linear( + attn_output_2, + l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_2 = l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs = torch.nn.functional.dropout(attn_output_3, 0.1, False, False) + attn_output_3 = None + hidden_states_1 = torch._C._nn.linear( + hidden_states, + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_7 = 0.5 * hidden_states_1 + pow_1 = torch.pow(hidden_states_1, 3.0) + mul_8 = 0.044715 * pow_1 + pow_1 = None + add_2 = hidden_states_1 + mul_8 + hidden_states_1 = mul_8 = None + mul_9 = 0.7978845608028654 * add_2 + add_2 = None + tanh = torch.tanh(mul_9) + mul_9 = None + add_3 = 1.0 + tanh + tanh = None + hidden_states_2 = mul_7 * add_3 + mul_7 = add_3 = None + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_2 = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states = torch.nn.functional.dropout( + hidden_states_3, 0.1, False, False + ) + hidden_states_3 = None + add_4 = attn_outputs + feed_forward_hidden_states + attn_outputs = feed_forward_hidden_states = None + hidden_states_4 = add_4 + inputs_embeds + add_4 = inputs_embeds = None + hidden_states_5 = torch.nn.functional.layer_norm( + hidden_states_4, + (2560,), + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ + ) = None + linear_6 = 
torch._C._nn.linear( + hidden_states_5, + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_4 = linear_6.view((1, 2, -1, 80)) + linear_6 = None + query_states_2 = view_4.transpose(1, 2) + view_4 = None + linear_7 = torch._C._nn.linear( + hidden_states_5, + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_5 = linear_7.view((1, 2, -1, 80)) + linear_7 = None + key_states_2 = view_5.transpose(1, 2) + view_5 = None + linear_8 = torch._C._nn.linear( + hidden_states_5, + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_6 = linear_8.view((1, 2, -1, 80)) + linear_8 = None + value_states_1 = view_6.transpose(1, 2) + view_6 = None + query_rot_1 = query_states_2[(Ellipsis, slice(None, 32, None))] + query_pass_1 = query_states_2[(Ellipsis, slice(32, None, None))] + query_states_2 = None + key_rot_1 = key_states_2[(Ellipsis, slice(None, 32, None))] + key_pass_1 = key_states_2[(Ellipsis, slice(32, None, None))] + key_states_2 = None + cos_4 = cos_2.unsqueeze(1) + sin_4 = sin_2.unsqueeze(1) + mul_11 = query_rot_1 * cos_4 + x1_2 = query_rot_1[(Ellipsis, slice(None, 16, None))] + x2_2 = query_rot_1[(Ellipsis, slice(16, None, None))] + query_rot_1 = None + neg_2 = -x2_2 + x2_2 = None + cat_5 = torch.cat((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + mul_12 = cat_5 * sin_4 + cat_5 = None + q_embed_1 = mul_11 + mul_12 + mul_11 = mul_12 = None + mul_13 = key_rot_1 * cos_4 + cos_4 = None + x1_3 = key_rot_1[(Ellipsis, slice(None, 16, None))] + x2_3 = key_rot_1[(Ellipsis, slice(16, None, None))] + key_rot_1 = None + neg_3 = -x2_3 + x2_3 = None + cat_6 = torch.cat((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + mul_14 = cat_6 * sin_4 + cat_6 = sin_4 = None + k_embed_1 = mul_13 + mul_14 + mul_13 = mul_14 = None + query_states_3 = torch.cat((q_embed_1, query_pass_1), dim=-1) + q_embed_1 = query_pass_1 = None + key_states_3 = torch.cat((k_embed_1, key_pass_1), dim=-1) + k_embed_1 = key_pass_1 = None + attention_mask_2 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_1 = query_states_3.contiguous() + query_states_3 = None + key_1 = key_states_3.contiguous() + value_1 = value_states_1.contiguous() + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_1, + value_1, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_1 = key_1 = value_1 = attention_mask_2 = None + transpose_8 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_8.contiguous() + transpose_8 = None + reshape_1 = attn_output_5.reshape(1, 2, -1) + 
attn_output_5 = None + attn_output_6 = reshape_1.contiguous() + reshape_1 = None + attn_output_7 = torch._C._nn.linear( + attn_output_6, + l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_6 = l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_1 = torch.nn.functional.dropout(attn_output_7, 0.1, False, False) + attn_output_7 = None + hidden_states_6 = torch._C._nn.linear( + hidden_states_5, + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_5 = ( + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_15 = 0.5 * hidden_states_6 + pow_2 = torch.pow(hidden_states_6, 3.0) + mul_16 = 0.044715 * pow_2 + pow_2 = None + add_8 = hidden_states_6 + mul_16 + hidden_states_6 = mul_16 = None + mul_17 = 0.7978845608028654 * add_8 + add_8 = None + tanh_1 = torch.tanh(mul_17) + mul_17 = None + add_9 = 1.0 + tanh_1 + tanh_1 = None + hidden_states_7 = mul_15 * add_9 + mul_15 = add_9 = None + hidden_states_8 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_7 = ( + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_1 = torch.nn.functional.dropout( + hidden_states_8, 0.1, False, False + ) + hidden_states_8 = None + add_10 = attn_outputs_1 + feed_forward_hidden_states_1 + attn_outputs_1 = feed_forward_hidden_states_1 = None + hidden_states_9 = add_10 + hidden_states_4 + add_10 = hidden_states_4 = None + hidden_states_10 = torch.nn.functional.layer_norm( + hidden_states_9, + (2560,), + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ + ) = None + linear_12 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_7 = linear_12.view((1, 2, -1, 80)) + linear_12 = None + query_states_4 = view_7.transpose(1, 2) + view_7 = None + linear_13 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_8 = linear_13.view((1, 2, -1, 80)) + linear_13 = None + 
key_states_4 = view_8.transpose(1, 2) + view_8 = None + linear_14 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_9 = linear_14.view((1, 2, -1, 80)) + linear_14 = None + value_states_2 = view_9.transpose(1, 2) + view_9 = None + query_rot_2 = query_states_4[(Ellipsis, slice(None, 32, None))] + query_pass_2 = query_states_4[(Ellipsis, slice(32, None, None))] + query_states_4 = None + key_rot_2 = key_states_4[(Ellipsis, slice(None, 32, None))] + key_pass_2 = key_states_4[(Ellipsis, slice(32, None, None))] + key_states_4 = None + cos_5 = cos_2.unsqueeze(1) + sin_5 = sin_2.unsqueeze(1) + mul_19 = query_rot_2 * cos_5 + x1_4 = query_rot_2[(Ellipsis, slice(None, 16, None))] + x2_4 = query_rot_2[(Ellipsis, slice(16, None, None))] + query_rot_2 = None + neg_4 = -x2_4 + x2_4 = None + cat_9 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_20 = cat_9 * sin_5 + cat_9 = None + q_embed_2 = mul_19 + mul_20 + mul_19 = mul_20 = None + mul_21 = key_rot_2 * cos_5 + cos_5 = None + x1_5 = key_rot_2[(Ellipsis, slice(None, 16, None))] + x2_5 = key_rot_2[(Ellipsis, slice(16, None, None))] + key_rot_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_10 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_22 = cat_10 * sin_5 + cat_10 = sin_5 = None + k_embed_2 = mul_21 + mul_22 + mul_21 = mul_22 = None + query_states_5 = torch.cat((q_embed_2, query_pass_2), dim=-1) + q_embed_2 = query_pass_2 = None + key_states_5 = torch.cat((k_embed_2, key_pass_2), dim=-1) + k_embed_2 = key_pass_2 = None + attention_mask_3 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_2 = query_states_5.contiguous() + query_states_5 = None + key_2 = key_states_5.contiguous() + value_2 = value_states_2.contiguous() + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_2, + value_2, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_2 = key_2 = value_2 = attention_mask_3 = None + transpose_12 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_12.contiguous() + transpose_12 = None + reshape_2 = attn_output_9.reshape(1, 2, -1) + attn_output_9 = None + attn_output_10 = reshape_2.contiguous() + reshape_2 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_10 = l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_2 = torch.nn.functional.dropout(attn_output_11, 0.1, False, False) + attn_output_11 = None + hidden_states_11 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_10 = ( + l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + 
l_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_23 = 0.5 * hidden_states_11 + pow_3 = torch.pow(hidden_states_11, 3.0) + mul_24 = 0.044715 * pow_3 + pow_3 = None + add_14 = hidden_states_11 + mul_24 + hidden_states_11 = mul_24 = None + mul_25 = 0.7978845608028654 * add_14 + add_14 = None + tanh_2 = torch.tanh(mul_25) + mul_25 = None + add_15 = 1.0 + tanh_2 + tanh_2 = None + hidden_states_12 = mul_23 * add_15 + mul_23 = add_15 = None + hidden_states_13 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_12 = ( + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_2 = torch.nn.functional.dropout( + hidden_states_13, 0.1, False, False + ) + hidden_states_13 = None + add_16 = attn_outputs_2 + feed_forward_hidden_states_2 + attn_outputs_2 = feed_forward_hidden_states_2 = None + hidden_states_14 = add_16 + hidden_states_9 + add_16 = hidden_states_9 = None + hidden_states_15 = torch.nn.functional.layer_norm( + hidden_states_14, + (2560,), + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ + ) = None + linear_18 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_10 = linear_18.view((1, 2, -1, 80)) + linear_18 = None + query_states_6 = view_10.transpose(1, 2) + view_10 = None + linear_19 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_11 = linear_19.view((1, 2, -1, 80)) + linear_19 = None + key_states_6 = view_11.transpose(1, 2) + view_11 = None + linear_20 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_12 = linear_20.view((1, 2, -1, 80)) + linear_20 = None + value_states_3 = view_12.transpose(1, 2) + view_12 = None + query_rot_3 = query_states_6[(Ellipsis, slice(None, 32, None))] + query_pass_3 = query_states_6[(Ellipsis, slice(32, None, None))] + query_states_6 = None + key_rot_3 = key_states_6[(Ellipsis, slice(None, 32, None))] + key_pass_3 = key_states_6[(Ellipsis, slice(32, None, None))] + key_states_6 = None + cos_6 = 
cos_2.unsqueeze(1) + sin_6 = sin_2.unsqueeze(1) + mul_27 = query_rot_3 * cos_6 + x1_6 = query_rot_3[(Ellipsis, slice(None, 16, None))] + x2_6 = query_rot_3[(Ellipsis, slice(16, None, None))] + query_rot_3 = None + neg_6 = -x2_6 + x2_6 = None + cat_13 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_28 = cat_13 * sin_6 + cat_13 = None + q_embed_3 = mul_27 + mul_28 + mul_27 = mul_28 = None + mul_29 = key_rot_3 * cos_6 + cos_6 = None + x1_7 = key_rot_3[(Ellipsis, slice(None, 16, None))] + x2_7 = key_rot_3[(Ellipsis, slice(16, None, None))] + key_rot_3 = None + neg_7 = -x2_7 + x2_7 = None + cat_14 = torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_30 = cat_14 * sin_6 + cat_14 = sin_6 = None + k_embed_3 = mul_29 + mul_30 + mul_29 = mul_30 = None + query_states_7 = torch.cat((q_embed_3, query_pass_3), dim=-1) + q_embed_3 = query_pass_3 = None + key_states_7 = torch.cat((k_embed_3, key_pass_3), dim=-1) + k_embed_3 = key_pass_3 = None + attention_mask_4 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_3 = query_states_7.contiguous() + query_states_7 = None + key_3 = key_states_7.contiguous() + value_3 = value_states_3.contiguous() + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_3, + key_3, + value_3, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_3 = key_3 = value_3 = attention_mask_4 = None + transpose_16 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_16.contiguous() + transpose_16 = None + reshape_3 = attn_output_13.reshape(1, 2, -1) + attn_output_13 = None + attn_output_14 = reshape_3.contiguous() + reshape_3 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_14 = l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_3 = torch.nn.functional.dropout(attn_output_15, 0.1, False, False) + attn_output_15 = None + hidden_states_16 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_15 = ( + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_31 = 0.5 * hidden_states_16 + pow_4 = torch.pow(hidden_states_16, 3.0) + mul_32 = 0.044715 * pow_4 + pow_4 = None + add_20 = hidden_states_16 + mul_32 + hidden_states_16 = mul_32 = None + mul_33 = 0.7978845608028654 * add_20 + add_20 = None + tanh_3 = torch.tanh(mul_33) + mul_33 = None + add_21 = 1.0 + tanh_3 + tanh_3 = None + hidden_states_17 = mul_31 * add_21 + mul_31 = add_21 = None + hidden_states_18 = torch._C._nn.linear( + hidden_states_17, + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_17 = ( + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_ + ) = None + 
feed_forward_hidden_states_3 = torch.nn.functional.dropout( + hidden_states_18, 0.1, False, False + ) + hidden_states_18 = None + add_22 = attn_outputs_3 + feed_forward_hidden_states_3 + attn_outputs_3 = feed_forward_hidden_states_3 = None + hidden_states_19 = add_22 + hidden_states_14 + add_22 = hidden_states_14 = None + hidden_states_20 = torch.nn.functional.layer_norm( + hidden_states_19, + (2560,), + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ + ) = None + linear_24 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_13 = linear_24.view((1, 2, -1, 80)) + linear_24 = None + query_states_8 = view_13.transpose(1, 2) + view_13 = None + linear_25 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_14 = linear_25.view((1, 2, -1, 80)) + linear_25 = None + key_states_8 = view_14.transpose(1, 2) + view_14 = None + linear_26 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_15 = linear_26.view((1, 2, -1, 80)) + linear_26 = None + value_states_4 = view_15.transpose(1, 2) + view_15 = None + query_rot_4 = query_states_8[(Ellipsis, slice(None, 32, None))] + query_pass_4 = query_states_8[(Ellipsis, slice(32, None, None))] + query_states_8 = None + key_rot_4 = key_states_8[(Ellipsis, slice(None, 32, None))] + key_pass_4 = key_states_8[(Ellipsis, slice(32, None, None))] + key_states_8 = None + cos_7 = cos_2.unsqueeze(1) + sin_7 = sin_2.unsqueeze(1) + mul_35 = query_rot_4 * cos_7 + x1_8 = query_rot_4[(Ellipsis, slice(None, 16, None))] + x2_8 = query_rot_4[(Ellipsis, slice(16, None, None))] + query_rot_4 = None + neg_8 = -x2_8 + x2_8 = None + cat_17 = torch.cat((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + mul_36 = cat_17 * sin_7 + cat_17 = None + q_embed_4 = mul_35 + mul_36 + mul_35 = mul_36 = None + mul_37 = key_rot_4 * cos_7 + cos_7 = None + x1_9 = key_rot_4[(Ellipsis, slice(None, 16, None))] + x2_9 = key_rot_4[(Ellipsis, slice(16, None, None))] + key_rot_4 = None + neg_9 = -x2_9 + x2_9 = None + cat_18 = torch.cat((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + mul_38 = cat_18 * sin_7 + cat_18 = sin_7 = None + k_embed_4 = mul_37 + mul_38 + mul_37 = mul_38 = None + query_states_9 = torch.cat((q_embed_4, query_pass_4), dim=-1) + q_embed_4 = query_pass_4 = None + key_states_9 = 
torch.cat((k_embed_4, key_pass_4), dim=-1) + k_embed_4 = key_pass_4 = None + attention_mask_5 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_4 = query_states_9.contiguous() + query_states_9 = None + key_4 = key_states_9.contiguous() + value_4 = value_states_4.contiguous() + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_4, + value_4, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_4 = key_4 = value_4 = attention_mask_5 = None + transpose_20 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_20.contiguous() + transpose_20 = None + reshape_4 = attn_output_17.reshape(1, 2, -1) + attn_output_17 = None + attn_output_18 = reshape_4.contiguous() + reshape_4 = None + attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_18 = l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_4 = torch.nn.functional.dropout(attn_output_19, 0.1, False, False) + attn_output_19 = None + hidden_states_21 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_20 = ( + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_39 = 0.5 * hidden_states_21 + pow_5 = torch.pow(hidden_states_21, 3.0) + mul_40 = 0.044715 * pow_5 + pow_5 = None + add_26 = hidden_states_21 + mul_40 + hidden_states_21 = mul_40 = None + mul_41 = 0.7978845608028654 * add_26 + add_26 = None + tanh_4 = torch.tanh(mul_41) + mul_41 = None + add_27 = 1.0 + tanh_4 + tanh_4 = None + hidden_states_22 = mul_39 * add_27 + mul_39 = add_27 = None + hidden_states_23 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_22 = ( + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_4 = torch.nn.functional.dropout( + hidden_states_23, 0.1, False, False + ) + hidden_states_23 = None + add_28 = attn_outputs_4 + feed_forward_hidden_states_4 + attn_outputs_4 = feed_forward_hidden_states_4 = None + hidden_states_24 = add_28 + hidden_states_19 + add_28 = hidden_states_19 = None + hidden_states_25 = torch.nn.functional.layer_norm( + hidden_states_24, + (2560,), + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ + ) = None + linear_30 = torch._C._nn.linear( + hidden_states_25, + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_, + 
l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_16 = linear_30.view((1, 2, -1, 80)) + linear_30 = None + query_states_10 = view_16.transpose(1, 2) + view_16 = None + linear_31 = torch._C._nn.linear( + hidden_states_25, + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_17 = linear_31.view((1, 2, -1, 80)) + linear_31 = None + key_states_10 = view_17.transpose(1, 2) + view_17 = None + linear_32 = torch._C._nn.linear( + hidden_states_25, + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_18 = linear_32.view((1, 2, -1, 80)) + linear_32 = None + value_states_5 = view_18.transpose(1, 2) + view_18 = None + query_rot_5 = query_states_10[(Ellipsis, slice(None, 32, None))] + query_pass_5 = query_states_10[(Ellipsis, slice(32, None, None))] + query_states_10 = None + key_rot_5 = key_states_10[(Ellipsis, slice(None, 32, None))] + key_pass_5 = key_states_10[(Ellipsis, slice(32, None, None))] + key_states_10 = None + cos_8 = cos_2.unsqueeze(1) + sin_8 = sin_2.unsqueeze(1) + mul_43 = query_rot_5 * cos_8 + x1_10 = query_rot_5[(Ellipsis, slice(None, 16, None))] + x2_10 = query_rot_5[(Ellipsis, slice(16, None, None))] + query_rot_5 = None + neg_10 = -x2_10 + x2_10 = None + cat_21 = torch.cat((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + mul_44 = cat_21 * sin_8 + cat_21 = None + q_embed_5 = mul_43 + mul_44 + mul_43 = mul_44 = None + mul_45 = key_rot_5 * cos_8 + cos_8 = None + x1_11 = key_rot_5[(Ellipsis, slice(None, 16, None))] + x2_11 = key_rot_5[(Ellipsis, slice(16, None, None))] + key_rot_5 = None + neg_11 = -x2_11 + x2_11 = None + cat_22 = torch.cat((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + mul_46 = cat_22 * sin_8 + cat_22 = sin_8 = None + k_embed_5 = mul_45 + mul_46 + mul_45 = mul_46 = None + query_states_11 = torch.cat((q_embed_5, query_pass_5), dim=-1) + q_embed_5 = query_pass_5 = None + key_states_11 = torch.cat((k_embed_5, key_pass_5), dim=-1) + k_embed_5 = key_pass_5 = None + attention_mask_6 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_5 = query_states_11.contiguous() + query_states_11 = None + key_5 = key_states_11.contiguous() + value_5 = value_states_5.contiguous() + attn_output_20 = torch._C._nn.scaled_dot_product_attention( + query_5, + key_5, + value_5, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_5 = key_5 = value_5 = attention_mask_6 = None + transpose_24 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_24.contiguous() + transpose_24 = None + reshape_5 = attn_output_21.reshape(1, 2, -1) + attn_output_21 = None + attn_output_22 = reshape_5.contiguous() + 
reshape_5 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_22 = l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_5 = torch.nn.functional.dropout(attn_output_23, 0.1, False, False) + attn_output_23 = None + hidden_states_26 = torch._C._nn.linear( + hidden_states_25, + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_25 = ( + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_47 = 0.5 * hidden_states_26 + pow_6 = torch.pow(hidden_states_26, 3.0) + mul_48 = 0.044715 * pow_6 + pow_6 = None + add_32 = hidden_states_26 + mul_48 + hidden_states_26 = mul_48 = None + mul_49 = 0.7978845608028654 * add_32 + add_32 = None + tanh_5 = torch.tanh(mul_49) + mul_49 = None + add_33 = 1.0 + tanh_5 + tanh_5 = None + hidden_states_27 = mul_47 * add_33 + mul_47 = add_33 = None + hidden_states_28 = torch._C._nn.linear( + hidden_states_27, + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_27 = ( + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_5 = torch.nn.functional.dropout( + hidden_states_28, 0.1, False, False + ) + hidden_states_28 = None + add_34 = attn_outputs_5 + feed_forward_hidden_states_5 + attn_outputs_5 = feed_forward_hidden_states_5 = None + hidden_states_29 = add_34 + hidden_states_24 + add_34 = hidden_states_24 = None + hidden_states_30 = torch.nn.functional.layer_norm( + hidden_states_29, + (2560,), + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ + ) = None + linear_36 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_19 = linear_36.view((1, 2, -1, 80)) + linear_36 = None + query_states_12 = view_19.transpose(1, 2) + view_19 = None + linear_37 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_20 = linear_37.view((1, 2, -1, 80)) + linear_37 = None + key_states_12 = 
view_20.transpose(1, 2) + view_20 = None + linear_38 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_21 = linear_38.view((1, 2, -1, 80)) + linear_38 = None + value_states_6 = view_21.transpose(1, 2) + view_21 = None + query_rot_6 = query_states_12[(Ellipsis, slice(None, 32, None))] + query_pass_6 = query_states_12[(Ellipsis, slice(32, None, None))] + query_states_12 = None + key_rot_6 = key_states_12[(Ellipsis, slice(None, 32, None))] + key_pass_6 = key_states_12[(Ellipsis, slice(32, None, None))] + key_states_12 = None + cos_9 = cos_2.unsqueeze(1) + sin_9 = sin_2.unsqueeze(1) + mul_51 = query_rot_6 * cos_9 + x1_12 = query_rot_6[(Ellipsis, slice(None, 16, None))] + x2_12 = query_rot_6[(Ellipsis, slice(16, None, None))] + query_rot_6 = None + neg_12 = -x2_12 + x2_12 = None + cat_25 = torch.cat((neg_12, x1_12), dim=-1) + neg_12 = x1_12 = None + mul_52 = cat_25 * sin_9 + cat_25 = None + q_embed_6 = mul_51 + mul_52 + mul_51 = mul_52 = None + mul_53 = key_rot_6 * cos_9 + cos_9 = None + x1_13 = key_rot_6[(Ellipsis, slice(None, 16, None))] + x2_13 = key_rot_6[(Ellipsis, slice(16, None, None))] + key_rot_6 = None + neg_13 = -x2_13 + x2_13 = None + cat_26 = torch.cat((neg_13, x1_13), dim=-1) + neg_13 = x1_13 = None + mul_54 = cat_26 * sin_9 + cat_26 = sin_9 = None + k_embed_6 = mul_53 + mul_54 + mul_53 = mul_54 = None + query_states_13 = torch.cat((q_embed_6, query_pass_6), dim=-1) + q_embed_6 = query_pass_6 = None + key_states_13 = torch.cat((k_embed_6, key_pass_6), dim=-1) + k_embed_6 = key_pass_6 = None + attention_mask_7 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_6 = query_states_13.contiguous() + query_states_13 = None + key_6 = key_states_13.contiguous() + value_6 = value_states_6.contiguous() + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_6, + value_6, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_6 = key_6 = value_6 = attention_mask_7 = None + transpose_28 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_28.contiguous() + transpose_28 = None + reshape_6 = attn_output_25.reshape(1, 2, -1) + attn_output_25 = None + attn_output_26 = reshape_6.contiguous() + reshape_6 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_26 = l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_6 = torch.nn.functional.dropout(attn_output_27, 0.1, False, False) + attn_output_27 = None + hidden_states_31 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_30 = ( + l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + 
+    hidden_states_31 = torch._C._nn.linear(
+        hidden_states_30,
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_30 = (
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_55 = 0.5 * hidden_states_31
+    pow_7 = torch.pow(hidden_states_31, 3.0)
+    mul_56 = 0.044715 * pow_7
+    pow_7 = None
+    add_38 = hidden_states_31 + mul_56
+    hidden_states_31 = mul_56 = None
+    mul_57 = 0.7978845608028654 * add_38
+    add_38 = None
+    tanh_6 = torch.tanh(mul_57)
+    mul_57 = None
+    add_39 = 1.0 + tanh_6
+    tanh_6 = None
+    hidden_states_32 = mul_55 * add_39
+    mul_55 = add_39 = None
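+    # [editor's annotation] mul_55 through hidden_states_32 are the tanh-based
+    # GELU approximation, unrolled by the tracer:
+    #     gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
+    # where the constant 0.7978845608028654 is sqrt(2/pi).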
+    hidden_states_33 = torch._C._nn.linear(
+        hidden_states_32,
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_32 = (
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_6 = torch.nn.functional.dropout(
+        hidden_states_33, 0.1, False, False
+    )
+    hidden_states_33 = None
+    add_40 = attn_outputs_6 + feed_forward_hidden_states_6
+    attn_outputs_6 = feed_forward_hidden_states_6 = None
+    hidden_states_34 = add_40 + hidden_states_29
+    add_40 = hidden_states_29 = None
+    hidden_states_35 = torch.nn.functional.layer_norm(
+        hidden_states_34,
+        (2560,),
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_42 = torch._C._nn.linear(
+        hidden_states_35,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_22 = linear_42.view((1, 2, -1, 80))
+    linear_42 = None
+    query_states_14 = view_22.transpose(1, 2)
+    view_22 = None
+    linear_43 = torch._C._nn.linear(
+        hidden_states_35,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_23 = linear_43.view((1, 2, -1, 80))
+    linear_43 = None
+    key_states_14 = view_23.transpose(1, 2)
+    view_23 = None
+    linear_44 = torch._C._nn.linear(
+        hidden_states_35,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_24 = linear_44.view((1, 2, -1, 80))
+    linear_44 = None
+    value_states_7 = view_24.transpose(1, 2)
+    view_24 = None
+    query_rot_7 = query_states_14[(Ellipsis, slice(None, 32, None))]
+    query_pass_7 = query_states_14[(Ellipsis, slice(32, None, None))]
+    query_states_14 = None
+    key_rot_7 = key_states_14[(Ellipsis, slice(None, 32, None))]
+    key_pass_7 = key_states_14[(Ellipsis, slice(32, None, None))]
+    key_states_14 = None
+    cos_10 = cos_2.unsqueeze(1)
+    sin_10 = sin_2.unsqueeze(1)
+    mul_59 = query_rot_7 * cos_10
+    x1_14 = query_rot_7[(Ellipsis, slice(None, 16, None))]
+    x2_14 = query_rot_7[(Ellipsis, slice(16, None, None))]
+    query_rot_7 = None
+    neg_14 = -x2_14
+    x2_14 = None
+    cat_29 = torch.cat((neg_14, x1_14), dim=-1)
+    neg_14 = x1_14 = None
+    mul_60 = cat_29 * sin_10
+    cat_29 = None
+    q_embed_7 = mul_59 + mul_60
+    mul_59 = mul_60 = None
+    mul_61 = key_rot_7 * cos_10
+    cos_10 = None
+    x1_15 = key_rot_7[(Ellipsis, slice(None, 16, None))]
+    x2_15 = key_rot_7[(Ellipsis, slice(16, None, None))]
+    key_rot_7 = None
+    neg_15 = -x2_15
+    x2_15 = None
+    cat_30 = torch.cat((neg_15, x1_15), dim=-1)
+    neg_15 = x1_15 = None
+    mul_62 = cat_30 * sin_10
+    cat_30 = sin_10 = None
+    k_embed_7 = mul_61 + mul_62
+    mul_61 = mul_62 = None
+    query_states_15 = torch.cat((q_embed_7, query_pass_7), dim=-1)
+    q_embed_7 = query_pass_7 = None
+    key_states_15 = torch.cat((k_embed_7, key_pass_7), dim=-1)
+    k_embed_7 = key_pass_7 = None
+    attention_mask_8 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_7 = query_states_15.contiguous()
+    query_states_15 = None
+    key_7 = key_states_15.contiguous()
+    value_7 = value_states_7.contiguous()
+    attn_output_28 = torch._C._nn.scaled_dot_product_attention(
+        query_7,
+        key_7,
+        value_7,
+        attn_mask=attention_mask_8,
+        dropout_p=0.0,
+        scale=0.11180339887498948,
+        is_causal=False,
+    )
+    query_7 = key_7 = value_7 = attention_mask_8 = None
+    transpose_32 = attn_output_28.transpose(1, 2)
+    attn_output_28 = None
+    attn_output_29 = transpose_32.contiguous()
+    transpose_32 = None
+    reshape_7 = attn_output_29.reshape(1, 2, -1)
+    attn_output_29 = None
+    attn_output_30 = reshape_7.contiguous()
+    reshape_7 = None
+    attn_output_31 = torch._C._nn.linear(
+        attn_output_30,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_30 = l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_7 = torch.nn.functional.dropout(attn_output_31, 0.1, False, False)
+    attn_output_31 = None
+    hidden_states_36 = torch._C._nn.linear(
+        hidden_states_35,
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_35 = (
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_63 = 0.5 * hidden_states_36
+    pow_8 = torch.pow(hidden_states_36, 3.0)
+    mul_64 = 0.044715 * pow_8
+    pow_8 = None
+    add_44 = hidden_states_36 + mul_64
+    hidden_states_36 = mul_64 = None
+    mul_65 = 0.7978845608028654 * add_44
+    add_44 = None
+    tanh_7 = torch.tanh(mul_65)
+    mul_65 = None
+    add_45 = 1.0 + tanh_7
+    tanh_7 = None
+    hidden_states_37 = mul_63 * add_45
+    mul_63 = add_45 = None
+    hidden_states_38 = torch._C._nn.linear(
+        hidden_states_37,
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_37 = (
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_7 = torch.nn.functional.dropout(
+        hidden_states_38, 0.1, False, False
+    )
+    hidden_states_38 = None
+    add_46 = attn_outputs_7 + feed_forward_hidden_states_7
+    attn_outputs_7 = feed_forward_hidden_states_7 = None
+    hidden_states_39 = add_46 + hidden_states_34
+    add_46 = hidden_states_34 = None
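+    # [editor's annotation] This is the parallel-residual layout of the phi
+    # architecture: attention and the MLP both consume the same layer-normed
+    # input, and a single residual update adds both branch outputs back to the
+    # pre-norm stream (hidden_states_39 = attn_outputs_7
+    # + feed_forward_hidden_states_7 + hidden_states_34). The
+    # dropout(..., 0.1, False, False) calls run with training=False, so they
+    # are identity ops in this inference trace.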
+    hidden_states_40 = torch.nn.functional.layer_norm(
+        hidden_states_39,
+        (2560,),
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_48 = torch._C._nn.linear(
+        hidden_states_40,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_25 = linear_48.view((1, 2, -1, 80))
+    linear_48 = None
+    query_states_16 = view_25.transpose(1, 2)
+    view_25 = None
+    linear_49 = torch._C._nn.linear(
+        hidden_states_40,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_26 = linear_49.view((1, 2, -1, 80))
+    linear_49 = None
+    key_states_16 = view_26.transpose(1, 2)
+    view_26 = None
+    linear_50 = torch._C._nn.linear(
+        hidden_states_40,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_27 = linear_50.view((1, 2, -1, 80))
+    linear_50 = None
+    value_states_8 = view_27.transpose(1, 2)
+    view_27 = None
+    query_rot_8 = query_states_16[(Ellipsis, slice(None, 32, None))]
+    query_pass_8 = query_states_16[(Ellipsis, slice(32, None, None))]
+    query_states_16 = None
+    key_rot_8 = key_states_16[(Ellipsis, slice(None, 32, None))]
+    key_pass_8 = key_states_16[(Ellipsis, slice(32, None, None))]
+    key_states_16 = None
+    cos_11 = cos_2.unsqueeze(1)
+    sin_11 = sin_2.unsqueeze(1)
+    mul_67 = query_rot_8 * cos_11
+    x1_16 = query_rot_8[(Ellipsis, slice(None, 16, None))]
+    x2_16 = query_rot_8[(Ellipsis, slice(16, None, None))]
+    query_rot_8 = None
+    neg_16 = -x2_16
+    x2_16 = None
+    cat_33 = torch.cat((neg_16, x1_16), dim=-1)
+    neg_16 = x1_16 = None
+    mul_68 = cat_33 * sin_11
+    cat_33 = None
+    q_embed_8 = mul_67 + mul_68
+    mul_67 = mul_68 = None
+    mul_69 = key_rot_8 * cos_11
+    cos_11 = None
+    x1_17 = key_rot_8[(Ellipsis, slice(None, 16, None))]
+    x2_17 = key_rot_8[(Ellipsis, slice(16, None, None))]
+    key_rot_8 = None
+    neg_17 = -x2_17
+    x2_17 = None
+    cat_34 = torch.cat((neg_17, x1_17), dim=-1)
+    neg_17 = x1_17 = None
+    mul_70 = cat_34 * sin_11
+    cat_34 = sin_11 = None
+    k_embed_8 = mul_69 + mul_70
+    mul_69 = mul_70 = None
+    query_states_17 = torch.cat((q_embed_8, query_pass_8), dim=-1)
+    q_embed_8 = query_pass_8 = None
+    key_states_17 = torch.cat((k_embed_8, key_pass_8), dim=-1)
+    k_embed_8 = key_pass_8 = None
+    attention_mask_9 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_8 = query_states_17.contiguous()
+    query_states_17 = None
+    key_8 = key_states_17.contiguous()
+    value_8 = value_states_8.contiguous()
+    attn_output_32 = torch._C._nn.scaled_dot_product_attention(
+        query_8,
+        key_8,
+        value_8,
+        attn_mask=attention_mask_9,
+        dropout_p=0.0,
+        scale=0.11180339887498948,
+        is_causal=False,
+    )
+    query_8 = key_8 = value_8 = attention_mask_9 = None
+    transpose_36 = attn_output_32.transpose(1, 2)
+    attn_output_32 = None
+    attn_output_33 = transpose_36.contiguous()
+    transpose_36 = None
+    reshape_8 = attn_output_33.reshape(1, 2, -1)
+    attn_output_33 = None
+    attn_output_34 = reshape_8.contiguous()
+    reshape_8 = None
+    attn_output_35 = torch._C._nn.linear(
+        attn_output_34,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_34 = l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_8 = torch.nn.functional.dropout(attn_output_35, 0.1, False, False)
+    attn_output_35 = None
+    hidden_states_41 = torch._C._nn.linear(
+        hidden_states_40,
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_40 = (
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_71 = 0.5 * hidden_states_41
+    pow_9 = torch.pow(hidden_states_41, 3.0)
+    mul_72 = 0.044715 * pow_9
+    pow_9 = None
+    add_50 = hidden_states_41 + mul_72
+    hidden_states_41 = mul_72 = None
+    mul_73 = 0.7978845608028654 * add_50
+    add_50 = None
+    tanh_8 = torch.tanh(mul_73)
+    mul_73 = None
+    add_51 = 1.0 + tanh_8
+    tanh_8 = None
+    hidden_states_42 = mul_71 * add_51
+    mul_71 = add_51 = None
+    hidden_states_43 = torch._C._nn.linear(
+        hidden_states_42,
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_42 = (
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_8 = torch.nn.functional.dropout(
+        hidden_states_43, 0.1, False, False
+    )
+    hidden_states_43 = None
+    add_52 = attn_outputs_8 + feed_forward_hidden_states_8
+    attn_outputs_8 = feed_forward_hidden_states_8 = None
+    hidden_states_44 = add_52 + hidden_states_39
+    add_52 = hidden_states_39 = None
+    hidden_states_45 = torch.nn.functional.layer_norm(
+        hidden_states_44,
+        (2560,),
+        l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_54 = torch._C._nn.linear(
+        hidden_states_45,
+        l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_28 = linear_54.view((1, 2, -1, 80))
+    linear_54 = None
+    query_states_18 = view_28.transpose(1, 2)
+    view_28 = None
+    linear_55 = torch._C._nn.linear(
+        hidden_states_45,
+        l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_29 = linear_55.view((1, 2, -1, 80))
+    linear_55 = None
+    key_states_18 = view_29.transpose(1, 2)
+    view_29 = None
+    linear_56 = torch._C._nn.linear(
+        hidden_states_45,
+        l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_30 = linear_56.view((1, 2, -1, 80))
+    linear_56 = None
+    value_states_9 = view_30.transpose(1, 2)
+    view_30 = None
+    query_rot_9 = query_states_18[(Ellipsis, slice(None, 32, None))]
+    query_pass_9 = query_states_18[(Ellipsis, slice(32, None, None))]
+    query_states_18 = None
+    key_rot_9 = key_states_18[(Ellipsis, slice(None, 32, None))]
+    key_pass_9 = key_states_18[(Ellipsis, slice(32, None, None))]
+    key_states_18 = None
+    cos_12 = cos_2.unsqueeze(1)
+    sin_12 = sin_2.unsqueeze(1)
+    mul_75 = query_rot_9 * cos_12
+    x1_18 = query_rot_9[(Ellipsis, slice(None, 16, None))]
+    x2_18 = query_rot_9[(Ellipsis, slice(16, None, None))]
+    query_rot_9 = None
+    neg_18 = -x2_18
+    x2_18 = None
+    cat_37 = torch.cat((neg_18, x1_18), dim=-1)
+    neg_18 = x1_18 = None
+    mul_76 = cat_37 * sin_12
+    cat_37 = None
+    q_embed_9 = mul_75 + mul_76
+    mul_75 = mul_76 = None
+    mul_77 = key_rot_9 * cos_12
+    cos_12 = None
+    x1_19 = key_rot_9[(Ellipsis, slice(None, 16, None))]
+    x2_19 = key_rot_9[(Ellipsis, slice(16, None, None))]
+    key_rot_9 = None
+    neg_19 = -x2_19
+    x2_19 = None
+    cat_38 = torch.cat((neg_19, x1_19), dim=-1)
+    neg_19 = x1_19 = None
+    mul_78 = cat_38 * sin_12
+    cat_38 = sin_12 = None
+    k_embed_9 = mul_77 + mul_78
+    mul_77 = mul_78 = None
+    query_states_19 = torch.cat((q_embed_9, query_pass_9), dim=-1)
+    q_embed_9 = query_pass_9 = None
+    key_states_19 = torch.cat((k_embed_9, key_pass_9), dim=-1)
+    k_embed_9 = key_pass_9 = None
+    attention_mask_10 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_9 = query_states_19.contiguous()
+    query_states_19 = None
+    key_9 = key_states_19.contiguous()
+    value_9 = value_states_9.contiguous()
+    attn_output_36 = torch._C._nn.scaled_dot_product_attention(
+        query_9,
+        key_9,
+        value_9,
+        attn_mask=attention_mask_10,
+        dropout_p=0.0,
+        scale=0.11180339887498948,
+        is_causal=False,
+    )
+    query_9 = key_9 = value_9 = attention_mask_10 = None
+    transpose_40 = attn_output_36.transpose(1, 2)
+    attn_output_36 = None
+    attn_output_37 = transpose_40.contiguous()
+    transpose_40 = None
+    reshape_9 = attn_output_37.reshape(1, 2, -1)
+    attn_output_37 = None
+    attn_output_38 = reshape_9.contiguous()
+    reshape_9 = None
+    attn_output_39 = torch._C._nn.linear(
+        attn_output_38,
+        l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_38 = l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_9 = torch.nn.functional.dropout(attn_output_39, 0.1, False, False)
+    attn_output_39 = None
+    hidden_states_46 = torch._C._nn.linear(
+        hidden_states_45,
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_45 = (
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_79 = 0.5 * hidden_states_46
+    pow_10 = torch.pow(hidden_states_46, 3.0)
+    mul_80 = 0.044715 * pow_10
+    pow_10 = None
+    add_56 = hidden_states_46 + mul_80
+    hidden_states_46 = mul_80 = None
+    mul_81 = 0.7978845608028654 * add_56
+    add_56 = None
+    tanh_9 = torch.tanh(mul_81)
+    mul_81 = None
+    add_57 = 1.0 + tanh_9
+    tanh_9 = None
+    hidden_states_47 = mul_79 * add_57
+    mul_79 = add_57 = None
+    hidden_states_48 = torch._C._nn.linear(
+        hidden_states_47,
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_47 = (
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_9 = torch.nn.functional.dropout(
+        hidden_states_48, 0.1, False, False
+    )
+    hidden_states_48 = None
+    add_58 = attn_outputs_9 + feed_forward_hidden_states_9
+    attn_outputs_9 = feed_forward_hidden_states_9 = None
+    hidden_states_49 = add_58 + hidden_states_44
+    add_58 = hidden_states_44 = None
+    hidden_states_50 = torch.nn.functional.layer_norm(
+        hidden_states_49,
+        (2560,),
+        l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_60 = torch._C._nn.linear(
+        hidden_states_50,
+        l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_31 = linear_60.view((1, 2, -1, 80))
+    linear_60 = None
+    query_states_20 = view_31.transpose(1, 2)
+    view_31 = None
+    linear_61 = torch._C._nn.linear(
+        hidden_states_50,
+        l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_32 = linear_61.view((1, 2, -1, 80))
+    linear_61 = None
+    key_states_20 = view_32.transpose(1, 2)
+    view_32 = None
+    linear_62 = torch._C._nn.linear(
+        hidden_states_50,
+        l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_33 = linear_62.view((1, 2, -1, 80))
+    linear_62 = None
+    value_states_10 = view_33.transpose(1, 2)
+    view_33 = None
+    query_rot_10 = query_states_20[(Ellipsis, slice(None, 32, None))]
+    query_pass_10 = query_states_20[(Ellipsis, slice(32, None, None))]
+    query_states_20 = None
+    key_rot_10 = key_states_20[(Ellipsis, slice(None, 32, None))]
+    key_pass_10 = key_states_20[(Ellipsis, slice(32, None, None))]
+    key_states_20 = None
+    cos_13 = cos_2.unsqueeze(1)
+    sin_13 = sin_2.unsqueeze(1)
+    mul_83 = query_rot_10 * cos_13
+    x1_20 = query_rot_10[(Ellipsis, slice(None, 16, None))]
+    x2_20 = query_rot_10[(Ellipsis, slice(16, None, None))]
+    query_rot_10 = None
+    neg_20 = -x2_20
+    x2_20 = None
+    cat_41 = torch.cat((neg_20, x1_20), dim=-1)
+    neg_20 = x1_20 = None
+    mul_84 = cat_41 * sin_13
+    cat_41 = None
+    q_embed_10 = mul_83 + mul_84
+    mul_83 = mul_84 = None
+    mul_85 = key_rot_10 * cos_13
+    cos_13 = None
+    x1_21 = key_rot_10[(Ellipsis, slice(None, 16, None))]
+    x2_21 = key_rot_10[(Ellipsis, slice(16, None, None))]
+    key_rot_10 = None
+    neg_21 = -x2_21
+    x2_21 = None
+    cat_42 = torch.cat((neg_21, x1_21), dim=-1)
+    neg_21 = x1_21 = None
+    mul_86 = cat_42 * sin_13
+    cat_42 = sin_13 = None
+    k_embed_10 = mul_85 + mul_86
+    mul_85 = mul_86 = None
+    query_states_21 = torch.cat((q_embed_10, query_pass_10), dim=-1)
+    q_embed_10 = query_pass_10 = None
+    key_states_21 = torch.cat((k_embed_10, key_pass_10), dim=-1)
+    k_embed_10 = key_pass_10 = None
+    attention_mask_11 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_10 = query_states_21.contiguous()
+    query_states_21 = None
+    key_10 = key_states_21.contiguous()
+    value_10 = value_states_10.contiguous()
+    attn_output_40 = torch._C._nn.scaled_dot_product_attention(
+        query_10,
+        key_10,
+        value_10,
+        attn_mask=attention_mask_11,
+        dropout_p=0.0,
+        scale=0.11180339887498948,
+        is_causal=False,
+    )
+    query_10 = key_10 = value_10 = attention_mask_11 = None
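+    # [editor's annotation] Every attention call above uses
+    # scaled_dot_product_attention with an explicit
+    # scale=0.11180339887498948 = 1 / sqrt(80), i.e. 1 / sqrt(head_dim), and
+    # is_causal=False because causality is already encoded in attn_mask.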
+    transpose_44 = attn_output_40.transpose(1, 2)
+    attn_output_40 = None
+    attn_output_41 = transpose_44.contiguous()
+    transpose_44 = None
+    reshape_10 = attn_output_41.reshape(1, 2, -1)
+    attn_output_41 = None
+    attn_output_42 = reshape_10.contiguous()
+    reshape_10 = None
+    attn_output_43 = torch._C._nn.linear(
+        attn_output_42,
+        l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_42 = l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_10 = torch.nn.functional.dropout(attn_output_43, 0.1, False, False)
+    attn_output_43 = None
+    hidden_states_51 = torch._C._nn.linear(
+        hidden_states_50,
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_50 = (
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_87 = 0.5 * hidden_states_51
+    pow_11 = torch.pow(hidden_states_51, 3.0)
+    mul_88 = 0.044715 * pow_11
+    pow_11 = None
+    add_62 = hidden_states_51 + mul_88
+    hidden_states_51 = mul_88 = None
+    mul_89 = 0.7978845608028654 * add_62
+    add_62 = None
+    tanh_10 = torch.tanh(mul_89)
+    mul_89 = None
+    add_63 = 1.0 + tanh_10
+    tanh_10 = None
+    hidden_states_52 = mul_87 * add_63
+    mul_87 = add_63 = None
+    hidden_states_53 = torch._C._nn.linear(
+        hidden_states_52,
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_52 = (
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_10 = torch.nn.functional.dropout(
+        hidden_states_53, 0.1, False, False
+    )
+    hidden_states_53 = None
+    add_64 = attn_outputs_10 + feed_forward_hidden_states_10
+    attn_outputs_10 = feed_forward_hidden_states_10 = None
+    hidden_states_54 = add_64 + hidden_states_49
+    add_64 = hidden_states_49 = None
+    hidden_states_55 = torch.nn.functional.layer_norm(
+        hidden_states_54,
+        (2560,),
+        l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_66 = torch._C._nn.linear(
+        hidden_states_55,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_34 = linear_66.view((1, 2, -1, 80))
+    linear_66 = None
+    query_states_22 = view_34.transpose(1, 2)
+    view_34 = None
+    linear_67 = torch._C._nn.linear(
+        hidden_states_55,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_35 = linear_67.view((1, 2, -1, 80))
+    linear_67 = None
+    key_states_22 = view_35.transpose(1, 2)
+    view_35 = None
+    linear_68 = torch._C._nn.linear(
+        hidden_states_55,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_36 = linear_68.view((1, 2, -1, 80))
+    linear_68 = None
+    value_states_11 = view_36.transpose(1, 2)
+    view_36 = None
+    query_rot_11 = query_states_22[(Ellipsis, slice(None, 32, None))]
+    query_pass_11 = query_states_22[(Ellipsis, slice(32, None, None))]
+    query_states_22 = None
+    key_rot_11 = key_states_22[(Ellipsis, slice(None, 32, None))]
+    key_pass_11 = key_states_22[(Ellipsis, slice(32, None, None))]
+    key_states_22 = None
+    cos_14 = cos_2.unsqueeze(1)
+    sin_14 = sin_2.unsqueeze(1)
+    mul_91 = query_rot_11 * cos_14
+    x1_22 = query_rot_11[(Ellipsis, slice(None, 16, None))]
+    x2_22 = query_rot_11[(Ellipsis, slice(16, None, None))]
+    query_rot_11 = None
+    neg_22 = -x2_22
+    x2_22 = None
+    cat_45 = torch.cat((neg_22, x1_22), dim=-1)
+    neg_22 = x1_22 = None
+    mul_92 = cat_45 * sin_14
+    cat_45 = None
+    q_embed_11 = mul_91 + mul_92
+    mul_91 = mul_92 = None
+    mul_93 = key_rot_11 * cos_14
+    cos_14 = None
+    x1_23 = key_rot_11[(Ellipsis, slice(None, 16, None))]
+    x2_23 = key_rot_11[(Ellipsis, slice(16, None, None))]
+    key_rot_11 = None
+    neg_23 = -x2_23
+    x2_23 = None
+    cat_46 = torch.cat((neg_23, x1_23), dim=-1)
+    neg_23 = x1_23 = None
+    mul_94 = cat_46 * sin_14
+    cat_46 = sin_14 = None
+    k_embed_11 = mul_93 + mul_94
+    mul_93 = mul_94 = None
+    query_states_23 = torch.cat((q_embed_11, query_pass_11), dim=-1)
+    q_embed_11 = query_pass_11 = None
+    key_states_23 = torch.cat((k_embed_11, key_pass_11), dim=-1)
+    k_embed_11 = key_pass_11 = None
+    attention_mask_12 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_11 = query_states_23.contiguous()
+    query_states_23 = None
+    key_11 = key_states_23.contiguous()
+    value_11 = value_states_11.contiguous()
+    attn_output_44 = torch._C._nn.scaled_dot_product_attention(
+        query_11,
+        key_11,
+        value_11,
+        attn_mask=attention_mask_12,
+        dropout_p=0.0,
+        scale=0.11180339887498948,
+        is_causal=False,
+    )
+    query_11 = key_11 = value_11 = attention_mask_12 = None
+    transpose_48 = attn_output_44.transpose(1, 2)
+    attn_output_44 = None
+    attn_output_45 = transpose_48.contiguous()
+    transpose_48 = None
+    reshape_11 = attn_output_45.reshape(1, 2, -1)
+    attn_output_45 = None
+    attn_output_46 = reshape_11.contiguous()
+    reshape_11 = None
+    attn_output_47 = torch._C._nn.linear(
+        attn_output_46,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_46 = l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_11 = torch.nn.functional.dropout(attn_output_47, 0.1, False, False)
+    attn_output_47 = None
+    hidden_states_56 = torch._C._nn.linear(
+        hidden_states_55,
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_55 = (
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_95 = 0.5 * hidden_states_56
+    pow_12 = torch.pow(hidden_states_56, 3.0)
+    mul_96 = 0.044715 * pow_12
+    pow_12 = None
+    add_68 = hidden_states_56 + mul_96
+    hidden_states_56 = mul_96 = None
+    mul_97 = 0.7978845608028654 * add_68
+    add_68 = None
+    tanh_11 = torch.tanh(mul_97)
+    mul_97 = None
+    add_69 = 1.0 + tanh_11
+    tanh_11 = None
+    hidden_states_57 = mul_95 * add_69
+    mul_95 = add_69 = None
+    hidden_states_58 = torch._C._nn.linear(
+        hidden_states_57,
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_57 = (
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_11 = torch.nn.functional.dropout(
+        hidden_states_58, 0.1, False, False
+    )
+    hidden_states_58 = None
+    add_70 = attn_outputs_11 + feed_forward_hidden_states_11
+    attn_outputs_11 = feed_forward_hidden_states_11 = None
+    hidden_states_59 = add_70 + hidden_states_54
+    add_70 = hidden_states_54 = None
+    hidden_states_60 = torch.nn.functional.layer_norm(
+        hidden_states_59,
+        (2560,),
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_72 = torch._C._nn.linear(
+        hidden_states_60,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_37 = linear_72.view((1, 2, -1, 80))
+    linear_72 = None
+    query_states_24 = view_37.transpose(1, 2)
+    view_37 = None
+    linear_73 = torch._C._nn.linear(
+        hidden_states_60,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_38 = linear_73.view((1, 2, -1, 80))
+    linear_73 = None
+    key_states_24 = view_38.transpose(1, 2)
+    view_38 = None
+    linear_74 = torch._C._nn.linear(
+        hidden_states_60,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_39 = linear_74.view((1, 2, -1, 80))
+    linear_74 = None
+    value_states_12 = view_39.transpose(1, 2)
+    view_39 = None
+    query_rot_12 = query_states_24[(Ellipsis, slice(None, 32, None))]
+    query_pass_12 = query_states_24[(Ellipsis, slice(32, None, None))]
+    query_states_24 = None
+    key_rot_12 = key_states_24[(Ellipsis, slice(None, 32, None))]
+    key_pass_12 = key_states_24[(Ellipsis, slice(32, None, None))]
+    key_states_24 = None
+    cos_15 = cos_2.unsqueeze(1)
+    sin_15 = sin_2.unsqueeze(1)
+    mul_99 = query_rot_12 * cos_15
+    x1_24 = query_rot_12[(Ellipsis, slice(None, 16, None))]
+    x2_24 = query_rot_12[(Ellipsis, slice(16, None, None))]
+    query_rot_12 = None
+    neg_24 = -x2_24
+    x2_24 = None
+    cat_49 = torch.cat((neg_24, x1_24), dim=-1)
+    neg_24 = x1_24 = None
+    mul_100 = cat_49 * sin_15
+    cat_49 = None
+    q_embed_12 = mul_99 + mul_100
+    mul_99 = mul_100 = None
+    mul_101 = key_rot_12 * cos_15
+    cos_15 = None
+    x1_25 = key_rot_12[(Ellipsis, slice(None, 16, None))]
+    x2_25 = key_rot_12[(Ellipsis, slice(16, None, None))]
+    key_rot_12 = None
+    neg_25 = -x2_25
+    x2_25 = None
+    cat_50 = torch.cat((neg_25, x1_25), dim=-1)
+    neg_25 = x1_25 = None
+    mul_102 = cat_50 * sin_15
+    cat_50 = sin_15 = None
+    k_embed_12 = mul_101 + mul_102
+    mul_101 = mul_102 = None
+    query_states_25 = torch.cat((q_embed_12, query_pass_12), dim=-1)
+    q_embed_12 = query_pass_12 = None
+    key_states_25 = torch.cat((k_embed_12, key_pass_12), dim=-1)
+    k_embed_12 = key_pass_12 = None
+    attention_mask_13 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_12 = query_states_25.contiguous()
+    query_states_25 = None
+    key_12 = key_states_25.contiguous()
+    value_12 = value_states_12.contiguous()
+    attn_output_48 = torch._C._nn.scaled_dot_product_attention(
+        query_12,
+        key_12,
+        value_12,
+        attn_mask=attention_mask_13,
+        dropout_p=0.0,
+        scale=0.11180339887498948,
+        is_causal=False,
+    )
+    query_12 = key_12 = value_12 = attention_mask_13 = None
+    transpose_52 = attn_output_48.transpose(1, 2)
+    attn_output_48 = None
+    attn_output_49 = transpose_52.contiguous()
+    transpose_52 = None
+    reshape_12 = attn_output_49.reshape(1, 2, -1)
+    attn_output_49 = None
+    attn_output_50 = reshape_12.contiguous()
+    reshape_12 = None
+    attn_output_51 = torch._C._nn.linear(
+        attn_output_50,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_50 = l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_12 = torch.nn.functional.dropout(attn_output_51, 0.1, False, False)
+    attn_output_51 = None
+    hidden_states_61 = torch._C._nn.linear(
+        hidden_states_60,
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_60 = (
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_103 = 0.5 * hidden_states_61
+    pow_13 = torch.pow(hidden_states_61, 3.0)
+    mul_104 = 0.044715 * pow_13
+    pow_13 = None
+    add_74 = hidden_states_61 + mul_104
+    hidden_states_61 = mul_104 = None
+    mul_105 = 0.7978845608028654 * add_74
+    add_74 = None
+    tanh_12 = torch.tanh(mul_105)
+    mul_105 = None
+    add_75 = 1.0 + tanh_12
+    tanh_12 = None
+    hidden_states_62 = mul_103 * add_75
+    mul_103 = add_75 = None
+    hidden_states_63 = torch._C._nn.linear(
+        hidden_states_62,
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_62 = (
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_12 = torch.nn.functional.dropout(
+        hidden_states_63, 0.1, False, False
+    )
+    hidden_states_63 = None
+    add_76 = attn_outputs_12 + feed_forward_hidden_states_12
+    attn_outputs_12 = feed_forward_hidden_states_12 = None
+    hidden_states_64 = add_76 + hidden_states_59
+    add_76 = hidden_states_59 = None
+    hidden_states_65 = torch.nn.functional.layer_norm(
+        hidden_states_64,
+        (2560,),
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_78 = torch._C._nn.linear(
+        hidden_states_65,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_40 = linear_78.view((1, 2, -1, 80))
+    linear_78 = None
+    query_states_26 = view_40.transpose(1, 2)
+    view_40 = None
+    linear_79 = torch._C._nn.linear(
+        hidden_states_65,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_41 = linear_79.view((1, 2, -1, 80))
+    linear_79 = None
+    key_states_26 = view_41.transpose(1, 2)
+    view_41 = None
+    linear_80 = torch._C._nn.linear(
+        hidden_states_65,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_42 = linear_80.view((1, 2, -1, 80))
+    linear_80 = None
+    value_states_13 = view_42.transpose(1, 2)
+    view_42 = None
+    query_rot_13 = query_states_26[(Ellipsis, slice(None, 32, None))]
+    query_pass_13 = query_states_26[(Ellipsis, slice(32, None, None))]
+    query_states_26 = None
+    key_rot_13 = key_states_26[(Ellipsis, slice(None, 32, None))]
+    key_pass_13 = key_states_26[(Ellipsis, slice(32, None, None))]
+    key_states_26 = None
+    cos_16 = cos_2.unsqueeze(1)
+    sin_16 = sin_2.unsqueeze(1)
+    mul_107 = query_rot_13 * cos_16
+    x1_26 = query_rot_13[(Ellipsis, slice(None, 16, None))]
+    x2_26 = query_rot_13[(Ellipsis, slice(16, None, None))]
+    query_rot_13 = None
+    neg_26 = -x2_26
+    x2_26 = None
+    cat_53 = torch.cat((neg_26, x1_26), dim=-1)
+    neg_26 = x1_26 = None
+    mul_108 = cat_53 * sin_16
+    cat_53 = None
+    q_embed_13 = mul_107 + mul_108
+    mul_107 = mul_108 = None
+    mul_109 = key_rot_13 * cos_16
+    cos_16 = None
+    x1_27 = key_rot_13[(Ellipsis, slice(None, 16, None))]
+    x2_27 = key_rot_13[(Ellipsis, slice(16, None, None))]
+    key_rot_13 = None
+    neg_27 = -x2_27
+    x2_27 = None
+    cat_54 = torch.cat((neg_27, x1_27), dim=-1)
+    neg_27 = x1_27 = None
+    mul_110 = cat_54 * sin_16
+    cat_54 = sin_16 = None
+    k_embed_13 = mul_109 + mul_110
+    mul_109 = mul_110 = None
+    query_states_27 = torch.cat((q_embed_13, query_pass_13), dim=-1)
+    q_embed_13 = query_pass_13 = None
+    key_states_27 = torch.cat((k_embed_13, key_pass_13), dim=-1)
+    k_embed_13 = key_pass_13 = None
+    attention_mask_14 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
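+    # [editor's annotation] causal_mask_2[..., :2] trims the precomputed mask
+    # to this trace's key/value length of 2 tokens; with dynamic shapes the
+    # slice bound would track the actual sequence length.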
+    query_13 = query_states_27.contiguous()
+    query_states_27 = None
+    key_13 = key_states_27.contiguous()
+    value_13 = value_states_13.contiguous()
+    attn_output_52 = torch._C._nn.scaled_dot_product_attention(
+        query_13,
+        key_13,
+        value_13,
+        attn_mask=attention_mask_14,
+        dropout_p=0.0,
+        scale=0.11180339887498948,
+        is_causal=False,
+    )
+    query_13 = key_13 = value_13 = attention_mask_14 = None
+    transpose_56 = attn_output_52.transpose(1, 2)
+    attn_output_52 = None
+    attn_output_53 = transpose_56.contiguous()
+    transpose_56 = None
+    reshape_13 = attn_output_53.reshape(1, 2, -1)
+    attn_output_53 = None
+    attn_output_54 = reshape_13.contiguous()
+    reshape_13 = None
+    attn_output_55 = torch._C._nn.linear(
+        attn_output_54,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_54 = l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_13 = torch.nn.functional.dropout(attn_output_55, 0.1, False, False)
+    attn_output_55 = None
+    hidden_states_66 = torch._C._nn.linear(
+        hidden_states_65,
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_65 = (
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_111 = 0.5 * hidden_states_66
+    pow_14 = torch.pow(hidden_states_66, 3.0)
+    mul_112 = 0.044715 * pow_14
+    pow_14 = None
+    add_80 = hidden_states_66 + mul_112
+    hidden_states_66 = mul_112 = None
+    mul_113 = 0.7978845608028654 * add_80
+    add_80 = None
+    tanh_13 = torch.tanh(mul_113)
+    mul_113 = None
+    add_81 = 1.0 + tanh_13
+    tanh_13 = None
+    hidden_states_67 = mul_111 * add_81
+    mul_111 = add_81 = None
+    hidden_states_68 = torch._C._nn.linear(
+        hidden_states_67,
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_67 = (
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_13 = torch.nn.functional.dropout(
+        hidden_states_68, 0.1, False, False
+    )
+    hidden_states_68 = None
+    add_82 = attn_outputs_13 + feed_forward_hidden_states_13
+    attn_outputs_13 = feed_forward_hidden_states_13 = None
+    hidden_states_69 = add_82 + hidden_states_64
+    add_82 = hidden_states_64 = None
+    hidden_states_70 = torch.nn.functional.layer_norm(
+        hidden_states_69,
+        (2560,),
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_84 = torch._C._nn.linear(
+        hidden_states_70,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_43 = linear_84.view((1, 2, -1, 80))
+    linear_84 = None
+    query_states_28 = view_43.transpose(1, 2)
+    view_43 = None
+    linear_85 = torch._C._nn.linear(
+        hidden_states_70,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_44 = linear_85.view((1, 2, -1, 80))
+    linear_85 = None
+    key_states_28 = view_44.transpose(1, 2)
+    view_44 = None
+    linear_86 = torch._C._nn.linear(
+        hidden_states_70,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_45 = linear_86.view((1, 2, -1, 80))
+    linear_86 = None
+    value_states_14 = view_45.transpose(1, 2)
+    view_45 = None
+    query_rot_14 = query_states_28[(Ellipsis, slice(None, 32, None))]
+    query_pass_14 = query_states_28[(Ellipsis, slice(32, None, None))]
+    query_states_28 = None
+    key_rot_14 = key_states_28[(Ellipsis, slice(None, 32, None))]
+    key_pass_14 = key_states_28[(Ellipsis, slice(32, None, None))]
+    key_states_28 = None
+    cos_17 = cos_2.unsqueeze(1)
+    sin_17 = sin_2.unsqueeze(1)
+    mul_115 = query_rot_14 * cos_17
+    x1_28 = query_rot_14[(Ellipsis, slice(None, 16, None))]
+    x2_28 = query_rot_14[(Ellipsis, slice(16, None, None))]
+    query_rot_14 = None
+    neg_28 = -x2_28
+    x2_28 = None
+    cat_57 = torch.cat((neg_28, x1_28), dim=-1)
+    neg_28 = x1_28 = None
+    mul_116 = cat_57 * sin_17
+    cat_57 = None
+    q_embed_14 = mul_115 + mul_116
+    mul_115 = mul_116 = None
+    mul_117 = key_rot_14 * cos_17
+    cos_17 = None
+    x1_29 = key_rot_14[(Ellipsis, slice(None, 16, None))]
+    x2_29 = key_rot_14[(Ellipsis, slice(16, None, None))]
+    key_rot_14 = None
+    neg_29 = -x2_29
+    x2_29 = None
+    cat_58 = torch.cat((neg_29, x1_29), dim=-1)
+    neg_29 = x1_29 = None
+    mul_118 = cat_58 * sin_17
+    cat_58 = sin_17 = None
+    k_embed_14 = mul_117 + mul_118
+    mul_117 = mul_118 = None
+    query_states_29 = torch.cat((q_embed_14, query_pass_14), dim=-1)
+    q_embed_14 = query_pass_14 = None
+    key_states_29 = torch.cat((k_embed_14, key_pass_14), dim=-1)
+    k_embed_14 = key_pass_14 = None
+    attention_mask_15 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_14 = query_states_29.contiguous()
+    query_states_29 = None
+    key_14 = key_states_29.contiguous()
+    value_14 = value_states_14.contiguous()
+    attn_output_56 = torch._C._nn.scaled_dot_product_attention(
+        query_14,
+        key_14,
+        value_14,
+        attn_mask=attention_mask_15,
+        dropout_p=0.0,
+        scale=0.11180339887498948,
+        is_causal=False,
+    )
+    query_14 = key_14 = value_14 = attention_mask_15 = None
+    transpose_60 = attn_output_56.transpose(1, 2)
+    attn_output_56 = None
+    attn_output_57 = transpose_60.contiguous()
+    transpose_60 = None
+    reshape_14 = attn_output_57.reshape(1, 2, -1)
+    attn_output_57 = None
+    attn_output_58 = reshape_14.contiguous()
+    reshape_14 = None
+    attn_output_59 = torch._C._nn.linear(
+        attn_output_58,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_58 = l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_14 = torch.nn.functional.dropout(attn_output_59, 0.1, False, False)
+    attn_output_59 = None
+    hidden_states_71 = torch._C._nn.linear(
+        hidden_states_70,
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_70 = (
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_119 = 0.5 * hidden_states_71
+    pow_15 = torch.pow(hidden_states_71, 3.0)
+    mul_120 = 0.044715 * pow_15
+    pow_15 = None
+    add_86 = hidden_states_71 + mul_120
+    hidden_states_71 = mul_120 = None
+    mul_121 = 0.7978845608028654 * add_86
+    add_86 = None
+    tanh_14 = torch.tanh(mul_121)
+    mul_121 = None
+    add_87 = 1.0 + tanh_14
+    tanh_14 = None
+    hidden_states_72 = mul_119 * add_87
+    mul_119 = add_87 = None
+    hidden_states_73 = torch._C._nn.linear(
+        hidden_states_72,
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_72 = (
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_14 = torch.nn.functional.dropout(
+        hidden_states_73, 0.1, False, False
+    )
+    hidden_states_73 = None
+    add_88 = attn_outputs_14 + feed_forward_hidden_states_14
+    attn_outputs_14 = feed_forward_hidden_states_14 = None
+    hidden_states_74 = add_88 + hidden_states_69
+    add_88 = hidden_states_69 = None
+    hidden_states_75 = torch.nn.functional.layer_norm(
+        hidden_states_74,
+        (2560,),
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_90 = torch._C._nn.linear(
+        hidden_states_75,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_46 = linear_90.view((1, 2, -1, 80))
+    linear_90 = None
+    query_states_30 = view_46.transpose(1, 2)
+    view_46 = None
+    linear_91 = torch._C._nn.linear(
+        hidden_states_75,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_47 = linear_91.view((1, 2, -1, 80))
+    linear_91 = None
+    key_states_30 = view_47.transpose(1, 2)
+    view_47 = None
+    linear_92 = torch._C._nn.linear(
+        hidden_states_75,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_48 = linear_92.view((1, 2, -1, 80))
+    linear_92 = None
+    value_states_15 = view_48.transpose(1, 2)
+    view_48 = None
+    query_rot_15 = query_states_30[(Ellipsis, slice(None, 32, None))]
+    query_pass_15 = query_states_30[(Ellipsis, slice(32, None, None))]
+    query_states_30 = None
+    key_rot_15 = key_states_30[(Ellipsis, slice(None, 32, None))]
+    key_pass_15 = key_states_30[(Ellipsis, slice(32, None, None))]
+    key_states_30 = None
+    cos_18 = cos_2.unsqueeze(1)
+    sin_18 = sin_2.unsqueeze(1)
+    mul_123 = query_rot_15 * cos_18
+    x1_30 = query_rot_15[(Ellipsis, slice(None, 16, None))]
+    x2_30 = query_rot_15[(Ellipsis, slice(16, None, None))]
+    query_rot_15 = None
+    neg_30 = -x2_30
+    x2_30 = None
+    cat_61 = torch.cat((neg_30, x1_30), dim=-1)
+    neg_30 = x1_30 = None
+    mul_124 = cat_61 * sin_18
+    cat_61 = None
+    q_embed_15 = mul_123 + mul_124
+    mul_123 = mul_124 = None
+    mul_125 = key_rot_15 * cos_18
+    cos_18 = None
+    x1_31 = key_rot_15[(Ellipsis, slice(None, 16, None))]
+    x2_31 = key_rot_15[(Ellipsis, slice(16, None, None))]
+    key_rot_15 = None
+    neg_31 = -x2_31
+    x2_31 = None
+    cat_62 = torch.cat((neg_31, x1_31), dim=-1)
+    neg_31 = x1_31 = None
+    mul_126 = cat_62 * sin_18
+    cat_62 = sin_18 = None
+    k_embed_15 = mul_125 + mul_126
+    mul_125 = mul_126 = None
+    query_states_31 = torch.cat((q_embed_15, query_pass_15), dim=-1)
+    q_embed_15 = query_pass_15 = None
+    key_states_31 = torch.cat((k_embed_15, key_pass_15), dim=-1)
+    k_embed_15 = key_pass_15 = None
+    attention_mask_16 = causal_mask_2[
+        (
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, None, None),
+            slice(None, 2, None),
+        )
+    ]
+    query_15 = query_states_31.contiguous()
+    query_states_31 = None
+    key_15 = key_states_31.contiguous()
+    value_15 = value_states_15.contiguous()
+    attn_output_60 = torch._C._nn.scaled_dot_product_attention(
+        query_15,
+        key_15,
+        value_15,
+        attn_mask=attention_mask_16,
+        dropout_p=0.0,
+        scale=0.11180339887498948,
+        is_causal=False,
+    )
+    query_15 = key_15 = value_15 = attention_mask_16 = None
+    transpose_64 = attn_output_60.transpose(1, 2)
+    attn_output_60 = None
+    attn_output_61 = transpose_64.contiguous()
+    transpose_64 = None
+    reshape_15 = attn_output_61.reshape(1, 2, -1)
+    attn_output_61 = None
+    attn_output_62 = reshape_15.contiguous()
+    reshape_15 = None
+    attn_output_63 = torch._C._nn.linear(
+        attn_output_62,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_,
+        l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_,
+    )
+    attn_output_62 = l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_ = (None)
+    attn_outputs_15 = torch.nn.functional.dropout(attn_output_63, 0.1, False, False)
+    attn_output_63 = None
+    hidden_states_76 = torch._C._nn.linear(
+        hidden_states_75,
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_,
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_,
+    )
+    hidden_states_75 = (
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_
+    ) = None
+    mul_127 = 0.5 * hidden_states_76
+    pow_16 = torch.pow(hidden_states_76, 3.0)
+    mul_128 = 0.044715 * pow_16
+    pow_16 = None
+    add_92 = hidden_states_76 + mul_128
+    hidden_states_76 = mul_128 = None
+    mul_129 = 0.7978845608028654 * add_92
+    add_92 = None
+    tanh_15 = torch.tanh(mul_129)
+    mul_129 = None
+    add_93 = 1.0 + tanh_15
+    tanh_15 = None
+    hidden_states_77 = mul_127 * add_93
+    mul_127 = add_93 = None
+    hidden_states_78 = torch._C._nn.linear(
+        hidden_states_77,
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_,
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_,
+    )
+    hidden_states_77 = (
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_
+    ) = (
+        l_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_
+    ) = None
+    feed_forward_hidden_states_15 = torch.nn.functional.dropout(
+        hidden_states_78, 0.1, False, False
+    )
+    hidden_states_78 = None
+    add_94 = attn_outputs_15 + feed_forward_hidden_states_15
+    attn_outputs_15 = feed_forward_hidden_states_15 = None
+    hidden_states_79 = add_94 + hidden_states_74
+    add_94 = hidden_states_74 = None
+    hidden_states_80 = torch.nn.functional.layer_norm(
+        hidden_states_79,
+        (2560,),
+        l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_,
+        l_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_,
+        1e-05,
+    )
+    l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = (
+        l_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_
+    ) = None
+    linear_96 = torch._C._nn.linear(
+        hidden_states_80,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+    view_49 = linear_96.view((1, 2, -1, 80))
+    linear_96 = None
+    query_states_32 = view_49.transpose(1, 2)
+    view_49 = None
+    linear_97 = torch._C._nn.linear(
+        hidden_states_80,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+    view_50 = linear_97.view((1, 2, -1, 80))
+    linear_97 = None
+    key_states_32 = view_50.transpose(1, 2)
+    view_50 = None
+    linear_98 = torch._C._nn.linear(
+        hidden_states_80,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_,
+        l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_,
+    )
+    l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+    view_51 = linear_98.view((1, 2, -1, 80))
+    linear_98 = None
+    value_states_16 = view_51.transpose(1, 2)
+    view_51 = None
+    query_rot_16 = query_states_32[(Ellipsis, slice(None, 32, None))]
+    query_pass_16 = query_states_32[(Ellipsis, slice(32, None, None))]
+    query_states_32 = None
+    key_rot_16 = key_states_32[(Ellipsis, slice(None, 32, None))]
+    key_pass_16 = key_states_32[(Ellipsis, slice(32, None, None))]
None, None))] + key_states_32 = None + cos_19 = cos_2.unsqueeze(1) + sin_19 = sin_2.unsqueeze(1) + mul_131 = query_rot_16 * cos_19 + x1_32 = query_rot_16[(Ellipsis, slice(None, 16, None))] + x2_32 = query_rot_16[(Ellipsis, slice(16, None, None))] + query_rot_16 = None + neg_32 = -x2_32 + x2_32 = None + cat_65 = torch.cat((neg_32, x1_32), dim=-1) + neg_32 = x1_32 = None + mul_132 = cat_65 * sin_19 + cat_65 = None + q_embed_16 = mul_131 + mul_132 + mul_131 = mul_132 = None + mul_133 = key_rot_16 * cos_19 + cos_19 = None + x1_33 = key_rot_16[(Ellipsis, slice(None, 16, None))] + x2_33 = key_rot_16[(Ellipsis, slice(16, None, None))] + key_rot_16 = None + neg_33 = -x2_33 + x2_33 = None + cat_66 = torch.cat((neg_33, x1_33), dim=-1) + neg_33 = x1_33 = None + mul_134 = cat_66 * sin_19 + cat_66 = sin_19 = None + k_embed_16 = mul_133 + mul_134 + mul_133 = mul_134 = None + query_states_33 = torch.cat((q_embed_16, query_pass_16), dim=-1) + q_embed_16 = query_pass_16 = None + key_states_33 = torch.cat((k_embed_16, key_pass_16), dim=-1) + k_embed_16 = key_pass_16 = None + attention_mask_17 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_16 = query_states_33.contiguous() + query_states_33 = None + key_16 = key_states_33.contiguous() + value_16 = value_states_16.contiguous() + attn_output_64 = torch._C._nn.scaled_dot_product_attention( + query_16, + key_16, + value_16, + attn_mask=attention_mask_17, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_16 = key_16 = value_16 = attention_mask_17 = None + transpose_68 = attn_output_64.transpose(1, 2) + attn_output_64 = None + attn_output_65 = transpose_68.contiguous() + transpose_68 = None + reshape_16 = attn_output_65.reshape(1, 2, -1) + attn_output_65 = None + attn_output_66 = reshape_16.contiguous() + reshape_16 = None + attn_output_67 = torch._C._nn.linear( + attn_output_66, + l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_66 = l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_16 = torch.nn.functional.dropout(attn_output_67, 0.1, False, False) + attn_output_67 = None + hidden_states_81 = torch._C._nn.linear( + hidden_states_80, + l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_80 = ( + l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_135 = 0.5 * hidden_states_81 + pow_17 = torch.pow(hidden_states_81, 3.0) + mul_136 = 0.044715 * pow_17 + pow_17 = None + add_98 = hidden_states_81 + mul_136 + hidden_states_81 = mul_136 = None + mul_137 = 0.7978845608028654 * add_98 + add_98 = None + tanh_16 = torch.tanh(mul_137) + mul_137 = None + add_99 = 1.0 + tanh_16 + tanh_16 = None + hidden_states_82 = mul_135 * add_99 + mul_135 = add_99 = None + hidden_states_83 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_82 = ( + 
l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_16 = torch.nn.functional.dropout( + hidden_states_83, 0.1, False, False + ) + hidden_states_83 = None + add_100 = attn_outputs_16 + feed_forward_hidden_states_16 + attn_outputs_16 = feed_forward_hidden_states_16 = None + hidden_states_84 = add_100 + hidden_states_79 + add_100 = hidden_states_79 = None + hidden_states_85 = torch.nn.functional.layer_norm( + hidden_states_84, + (2560,), + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_ + ) = None + linear_102 = torch._C._nn.linear( + hidden_states_85, + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_52 = linear_102.view((1, 2, -1, 80)) + linear_102 = None + query_states_34 = view_52.transpose(1, 2) + view_52 = None + linear_103 = torch._C._nn.linear( + hidden_states_85, + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_53 = linear_103.view((1, 2, -1, 80)) + linear_103 = None + key_states_34 = view_53.transpose(1, 2) + view_53 = None + linear_104 = torch._C._nn.linear( + hidden_states_85, + l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_54 = linear_104.view((1, 2, -1, 80)) + linear_104 = None + value_states_17 = view_54.transpose(1, 2) + view_54 = None + query_rot_17 = query_states_34[(Ellipsis, slice(None, 32, None))] + query_pass_17 = query_states_34[(Ellipsis, slice(32, None, None))] + query_states_34 = None + key_rot_17 = key_states_34[(Ellipsis, slice(None, 32, None))] + key_pass_17 = key_states_34[(Ellipsis, slice(32, None, None))] + key_states_34 = None + cos_20 = cos_2.unsqueeze(1) + sin_20 = sin_2.unsqueeze(1) + mul_139 = query_rot_17 * cos_20 + x1_34 = query_rot_17[(Ellipsis, slice(None, 16, None))] + x2_34 = query_rot_17[(Ellipsis, slice(16, None, None))] + query_rot_17 = None + neg_34 = -x2_34 + x2_34 = None + cat_69 = torch.cat((neg_34, x1_34), dim=-1) + neg_34 = x1_34 = None + mul_140 = cat_69 * sin_20 + cat_69 = None + q_embed_17 = mul_139 + mul_140 + mul_139 = mul_140 = None + mul_141 = key_rot_17 * cos_20 + cos_20 = None + x1_35 = key_rot_17[(Ellipsis, slice(None, 16, None))] + x2_35 = key_rot_17[(Ellipsis, slice(16, None, None))] + key_rot_17 = None + neg_35 = -x2_35 + x2_35 = None + cat_70 = torch.cat((neg_35, x1_35), dim=-1) 
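# [Editor's note — annotation, not part of the generated graph.]
# Each neg_*/cat_* pair in this trace implements the standard "rotate_half"
# rotary position embedding: split the rotary slice of the head in two,
# negate the second half, and swap. A minimal reference sketch (names are
# illustrative, not from the trace):
#
#   import torch
#
#   def rotate_half(x: torch.Tensor) -> torch.Tensor:
#       half = x.shape[-1] // 2            # 16 of the 32 rotary dims here
#       x1, x2 = x[..., :half], x[..., half:]
#       return torch.cat((-x2, x1), dim=-1)
#
#   def apply_rope(q, k, cos, sin):
#       return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin
#
# which matches the repeated mul_*/cat_*/add_* chains
# (q_embed_* = q_rot * cos + rotate_half(q_rot) * sin) in every layer.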
+ neg_35 = x1_35 = None + mul_142 = cat_70 * sin_20 + cat_70 = sin_20 = None + k_embed_17 = mul_141 + mul_142 + mul_141 = mul_142 = None + query_states_35 = torch.cat((q_embed_17, query_pass_17), dim=-1) + q_embed_17 = query_pass_17 = None + key_states_35 = torch.cat((k_embed_17, key_pass_17), dim=-1) + k_embed_17 = key_pass_17 = None + attention_mask_18 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_17 = query_states_35.contiguous() + query_states_35 = None + key_17 = key_states_35.contiguous() + value_17 = value_states_17.contiguous() + attn_output_68 = torch._C._nn.scaled_dot_product_attention( + query_17, + key_17, + value_17, + attn_mask=attention_mask_18, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_17 = key_17 = value_17 = attention_mask_18 = None + transpose_72 = attn_output_68.transpose(1, 2) + attn_output_68 = None + attn_output_69 = transpose_72.contiguous() + transpose_72 = None + reshape_17 = attn_output_69.reshape(1, 2, -1) + attn_output_69 = None + attn_output_70 = reshape_17.contiguous() + reshape_17 = None + attn_output_71 = torch._C._nn.linear( + attn_output_70, + l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_70 = l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_17 = torch.nn.functional.dropout(attn_output_71, 0.1, False, False) + attn_output_71 = None + hidden_states_86 = torch._C._nn.linear( + hidden_states_85, + l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_85 = ( + l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_143 = 0.5 * hidden_states_86 + pow_18 = torch.pow(hidden_states_86, 3.0) + mul_144 = 0.044715 * pow_18 + pow_18 = None + add_104 = hidden_states_86 + mul_144 + hidden_states_86 = mul_144 = None + mul_145 = 0.7978845608028654 * add_104 + add_104 = None + tanh_17 = torch.tanh(mul_145) + mul_145 = None + add_105 = 1.0 + tanh_17 + tanh_17 = None + hidden_states_87 = mul_143 * add_105 + mul_143 = add_105 = None + hidden_states_88 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_87 = ( + l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_17 = torch.nn.functional.dropout( + hidden_states_88, 0.1, False, False + ) + hidden_states_88 = None + add_106 = attn_outputs_17 + feed_forward_hidden_states_17 + attn_outputs_17 = feed_forward_hidden_states_17 = None + hidden_states_89 = add_106 + hidden_states_84 + add_106 = hidden_states_84 = None + hidden_states_90 = torch.nn.functional.layer_norm( + hidden_states_89, + (2560,), + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + 
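# [Editor's note — annotation, not part of the generated graph.]
# The linear_*/view_*/transpose_* triples that follow are the usual
# multi-head split: project, reshape to (batch, seq, heads, head_dim),
# then move heads ahead of the sequence axis. A sketch under the traced
# shapes (batch=1, seq=2, head_dim=80; head count left as -1):
#
#   import torch
#   import torch.nn.functional as F
#
#   def split_heads(x, weight, bias, head_dim=80):
#       y = F.linear(x, weight, bias)      # (1, 2, hidden)
#       y = y.view(1, 2, -1, head_dim)     # (1, 2, heads, 80)
#       return y.transpose(1, 2)           # (1, heads, 2, 80)
#
# torch._C._nn.linear is the internal binding behind F.linear, so the
# traced calls are equivalent to the sketch above.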
l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_ + ) = None + linear_108 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_55 = linear_108.view((1, 2, -1, 80)) + linear_108 = None + query_states_36 = view_55.transpose(1, 2) + view_55 = None + linear_109 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_56 = linear_109.view((1, 2, -1, 80)) + linear_109 = None + key_states_36 = view_56.transpose(1, 2) + view_56 = None + linear_110 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_57 = linear_110.view((1, 2, -1, 80)) + linear_110 = None + value_states_18 = view_57.transpose(1, 2) + view_57 = None + query_rot_18 = query_states_36[(Ellipsis, slice(None, 32, None))] + query_pass_18 = query_states_36[(Ellipsis, slice(32, None, None))] + query_states_36 = None + key_rot_18 = key_states_36[(Ellipsis, slice(None, 32, None))] + key_pass_18 = key_states_36[(Ellipsis, slice(32, None, None))] + key_states_36 = None + cos_21 = cos_2.unsqueeze(1) + sin_21 = sin_2.unsqueeze(1) + mul_147 = query_rot_18 * cos_21 + x1_36 = query_rot_18[(Ellipsis, slice(None, 16, None))] + x2_36 = query_rot_18[(Ellipsis, slice(16, None, None))] + query_rot_18 = None + neg_36 = -x2_36 + x2_36 = None + cat_73 = torch.cat((neg_36, x1_36), dim=-1) + neg_36 = x1_36 = None + mul_148 = cat_73 * sin_21 + cat_73 = None + q_embed_18 = mul_147 + mul_148 + mul_147 = mul_148 = None + mul_149 = key_rot_18 * cos_21 + cos_21 = None + x1_37 = key_rot_18[(Ellipsis, slice(None, 16, None))] + x2_37 = key_rot_18[(Ellipsis, slice(16, None, None))] + key_rot_18 = None + neg_37 = -x2_37 + x2_37 = None + cat_74 = torch.cat((neg_37, x1_37), dim=-1) + neg_37 = x1_37 = None + mul_150 = cat_74 * sin_21 + cat_74 = sin_21 = None + k_embed_18 = mul_149 + mul_150 + mul_149 = mul_150 = None + query_states_37 = torch.cat((q_embed_18, query_pass_18), dim=-1) + q_embed_18 = query_pass_18 = None + key_states_37 = torch.cat((k_embed_18, key_pass_18), dim=-1) + k_embed_18 = key_pass_18 = None + attention_mask_19 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_18 = query_states_37.contiguous() + query_states_37 = None + key_18 = key_states_37.contiguous() + value_18 = value_states_18.contiguous() + attn_output_72 = torch._C._nn.scaled_dot_product_attention( + query_18, + key_18, + value_18, + 
attn_mask=attention_mask_19, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_18 = key_18 = value_18 = attention_mask_19 = None + transpose_76 = attn_output_72.transpose(1, 2) + attn_output_72 = None + attn_output_73 = transpose_76.contiguous() + transpose_76 = None + reshape_18 = attn_output_73.reshape(1, 2, -1) + attn_output_73 = None + attn_output_74 = reshape_18.contiguous() + reshape_18 = None + attn_output_75 = torch._C._nn.linear( + attn_output_74, + l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_74 = l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_18 = torch.nn.functional.dropout(attn_output_75, 0.1, False, False) + attn_output_75 = None + hidden_states_91 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_90 = ( + l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_151 = 0.5 * hidden_states_91 + pow_19 = torch.pow(hidden_states_91, 3.0) + mul_152 = 0.044715 * pow_19 + pow_19 = None + add_110 = hidden_states_91 + mul_152 + hidden_states_91 = mul_152 = None + mul_153 = 0.7978845608028654 * add_110 + add_110 = None + tanh_18 = torch.tanh(mul_153) + mul_153 = None + add_111 = 1.0 + tanh_18 + tanh_18 = None + hidden_states_92 = mul_151 * add_111 + mul_151 = add_111 = None + hidden_states_93 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_92 = ( + l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_18 = torch.nn.functional.dropout( + hidden_states_93, 0.1, False, False + ) + hidden_states_93 = None + add_112 = attn_outputs_18 + feed_forward_hidden_states_18 + attn_outputs_18 = feed_forward_hidden_states_18 = None + hidden_states_94 = add_112 + hidden_states_89 + add_112 = hidden_states_89 = None + hidden_states_95 = torch.nn.functional.layer_norm( + hidden_states_94, + (2560,), + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_ + ) = None + linear_114 = torch._C._nn.linear( + hidden_states_95, + l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_58 = linear_114.view((1, 2, -1, 80)) + linear_114 = None + query_states_38 = view_58.transpose(1, 2) + view_58 = None + linear_115 = torch._C._nn.linear( + 
hidden_states_95, + l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_59 = linear_115.view((1, 2, -1, 80)) + linear_115 = None + key_states_38 = view_59.transpose(1, 2) + view_59 = None + linear_116 = torch._C._nn.linear( + hidden_states_95, + l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_60 = linear_116.view((1, 2, -1, 80)) + linear_116 = None + value_states_19 = view_60.transpose(1, 2) + view_60 = None + query_rot_19 = query_states_38[(Ellipsis, slice(None, 32, None))] + query_pass_19 = query_states_38[(Ellipsis, slice(32, None, None))] + query_states_38 = None + key_rot_19 = key_states_38[(Ellipsis, slice(None, 32, None))] + key_pass_19 = key_states_38[(Ellipsis, slice(32, None, None))] + key_states_38 = None + cos_22 = cos_2.unsqueeze(1) + sin_22 = sin_2.unsqueeze(1) + mul_155 = query_rot_19 * cos_22 + x1_38 = query_rot_19[(Ellipsis, slice(None, 16, None))] + x2_38 = query_rot_19[(Ellipsis, slice(16, None, None))] + query_rot_19 = None + neg_38 = -x2_38 + x2_38 = None + cat_77 = torch.cat((neg_38, x1_38), dim=-1) + neg_38 = x1_38 = None + mul_156 = cat_77 * sin_22 + cat_77 = None + q_embed_19 = mul_155 + mul_156 + mul_155 = mul_156 = None + mul_157 = key_rot_19 * cos_22 + cos_22 = None + x1_39 = key_rot_19[(Ellipsis, slice(None, 16, None))] + x2_39 = key_rot_19[(Ellipsis, slice(16, None, None))] + key_rot_19 = None + neg_39 = -x2_39 + x2_39 = None + cat_78 = torch.cat((neg_39, x1_39), dim=-1) + neg_39 = x1_39 = None + mul_158 = cat_78 * sin_22 + cat_78 = sin_22 = None + k_embed_19 = mul_157 + mul_158 + mul_157 = mul_158 = None + query_states_39 = torch.cat((q_embed_19, query_pass_19), dim=-1) + q_embed_19 = query_pass_19 = None + key_states_39 = torch.cat((k_embed_19, key_pass_19), dim=-1) + k_embed_19 = key_pass_19 = None + attention_mask_20 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_19 = query_states_39.contiguous() + query_states_39 = None + key_19 = key_states_39.contiguous() + value_19 = value_states_19.contiguous() + attn_output_76 = torch._C._nn.scaled_dot_product_attention( + query_19, + key_19, + value_19, + attn_mask=attention_mask_20, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_19 = key_19 = value_19 = attention_mask_20 = None + transpose_80 = attn_output_76.transpose(1, 2) + attn_output_76 = None + attn_output_77 = transpose_80.contiguous() + transpose_80 = None + reshape_19 = attn_output_77.reshape(1, 2, -1) + attn_output_77 = None + attn_output_78 = reshape_19.contiguous() + reshape_19 = None + attn_output_79 = torch._C._nn.linear( + attn_output_78, + l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_78 = l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_ 
= l_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_19 = torch.nn.functional.dropout(attn_output_79, 0.1, False, False) + attn_output_79 = None + hidden_states_96 = torch._C._nn.linear( + hidden_states_95, + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_95 = ( + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_159 = 0.5 * hidden_states_96 + pow_20 = torch.pow(hidden_states_96, 3.0) + mul_160 = 0.044715 * pow_20 + pow_20 = None + add_116 = hidden_states_96 + mul_160 + hidden_states_96 = mul_160 = None + mul_161 = 0.7978845608028654 * add_116 + add_116 = None + tanh_19 = torch.tanh(mul_161) + mul_161 = None + add_117 = 1.0 + tanh_19 + tanh_19 = None + hidden_states_97 = mul_159 * add_117 + mul_159 = add_117 = None + hidden_states_98 = torch._C._nn.linear( + hidden_states_97, + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_97 = ( + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_19 = torch.nn.functional.dropout( + hidden_states_98, 0.1, False, False + ) + hidden_states_98 = None + add_118 = attn_outputs_19 + feed_forward_hidden_states_19 + attn_outputs_19 = feed_forward_hidden_states_19 = None + hidden_states_99 = add_118 + hidden_states_94 + add_118 = hidden_states_94 = None + hidden_states_100 = torch.nn.functional.layer_norm( + hidden_states_99, + (2560,), + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_ + ) = None + linear_120 = torch._C._nn.linear( + hidden_states_100, + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_61 = linear_120.view((1, 2, -1, 80)) + linear_120 = None + query_states_40 = view_61.transpose(1, 2) + view_61 = None + linear_121 = torch._C._nn.linear( + hidden_states_100, + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_62 = linear_121.view((1, 2, -1, 80)) + linear_121 = None + key_states_40 = view_62.transpose(1, 2) + view_62 = None + linear_122 = torch._C._nn.linear( + hidden_states_100, + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_, + ) + 
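# [Editor's note — annotation, not part of the generated graph.]
# The query_rot_*/query_pass_* (and key_rot_*/key_pass_*) slices implement
# *partial* rotary embeddings: with head_dim = 80, only the first 32 dims
# are rotated and the remaining 48 pass through unchanged, then the two
# pieces are re-concatenated. That split is consistent with a partial
# rotary factor of 0.4 (32/80). Hedged sketch of the same split:
#
#   rotary_dim = 32                        # per the traced slice(None, 32)
#   q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
#   q = torch.cat((apply_rope_to(q_rot), q_pass), dim=-1)
#
# (apply_rope_to is a placeholder for the cos/sin rotation shown earlier.)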
l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_63 = linear_122.view((1, 2, -1, 80)) + linear_122 = None + value_states_20 = view_63.transpose(1, 2) + view_63 = None + query_rot_20 = query_states_40[(Ellipsis, slice(None, 32, None))] + query_pass_20 = query_states_40[(Ellipsis, slice(32, None, None))] + query_states_40 = None + key_rot_20 = key_states_40[(Ellipsis, slice(None, 32, None))] + key_pass_20 = key_states_40[(Ellipsis, slice(32, None, None))] + key_states_40 = None + cos_23 = cos_2.unsqueeze(1) + sin_23 = sin_2.unsqueeze(1) + mul_163 = query_rot_20 * cos_23 + x1_40 = query_rot_20[(Ellipsis, slice(None, 16, None))] + x2_40 = query_rot_20[(Ellipsis, slice(16, None, None))] + query_rot_20 = None + neg_40 = -x2_40 + x2_40 = None + cat_81 = torch.cat((neg_40, x1_40), dim=-1) + neg_40 = x1_40 = None + mul_164 = cat_81 * sin_23 + cat_81 = None + q_embed_20 = mul_163 + mul_164 + mul_163 = mul_164 = None + mul_165 = key_rot_20 * cos_23 + cos_23 = None + x1_41 = key_rot_20[(Ellipsis, slice(None, 16, None))] + x2_41 = key_rot_20[(Ellipsis, slice(16, None, None))] + key_rot_20 = None + neg_41 = -x2_41 + x2_41 = None + cat_82 = torch.cat((neg_41, x1_41), dim=-1) + neg_41 = x1_41 = None + mul_166 = cat_82 * sin_23 + cat_82 = sin_23 = None + k_embed_20 = mul_165 + mul_166 + mul_165 = mul_166 = None + query_states_41 = torch.cat((q_embed_20, query_pass_20), dim=-1) + q_embed_20 = query_pass_20 = None + key_states_41 = torch.cat((k_embed_20, key_pass_20), dim=-1) + k_embed_20 = key_pass_20 = None + attention_mask_21 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_20 = query_states_41.contiguous() + query_states_41 = None + key_20 = key_states_41.contiguous() + value_20 = value_states_20.contiguous() + attn_output_80 = torch._C._nn.scaled_dot_product_attention( + query_20, + key_20, + value_20, + attn_mask=attention_mask_21, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_20 = key_20 = value_20 = attention_mask_21 = None + transpose_84 = attn_output_80.transpose(1, 2) + attn_output_80 = None + attn_output_81 = transpose_84.contiguous() + transpose_84 = None + reshape_20 = attn_output_81.reshape(1, 2, -1) + attn_output_81 = None + attn_output_82 = reshape_20.contiguous() + reshape_20 = None + attn_output_83 = torch._C._nn.linear( + attn_output_82, + l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_82 = l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_20 = torch.nn.functional.dropout(attn_output_83, 0.1, False, False) + attn_output_83 = None + hidden_states_101 = torch._C._nn.linear( + hidden_states_100, + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_100 = ( + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_167 = 0.5 * hidden_states_101 + pow_21 = torch.pow(hidden_states_101, 3.0) + mul_168 = 0.044715 * pow_21 + 
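# [Editor's note — annotation, not part of the generated graph.]
# The mul_/pow_/tanh_ chain in each MLP is the tanh approximation of GELU,
#   gelu(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
# with sqrt(2/pi) ≈ 0.7978845608028654 — exactly the constants traced here.
# Equivalence check against the public API, for reference:
#
#   import torch
#   x = torch.randn(1, 2, 64)              # shape is illustrative only
#   y_traced = 0.5 * x * (
#       1.0 + torch.tanh(0.7978845608028654 * (x + 0.044715 * x.pow(3.0)))
#   )
#   y_builtin = torch.nn.functional.gelu(x, approximate="tanh")
#   assert torch.allclose(y_traced, y_builtin, atol=1e-6)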
pow_21 = None + add_122 = hidden_states_101 + mul_168 + hidden_states_101 = mul_168 = None + mul_169 = 0.7978845608028654 * add_122 + add_122 = None + tanh_20 = torch.tanh(mul_169) + mul_169 = None + add_123 = 1.0 + tanh_20 + tanh_20 = None + hidden_states_102 = mul_167 * add_123 + mul_167 = add_123 = None + hidden_states_103 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_102 = ( + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_20 = torch.nn.functional.dropout( + hidden_states_103, 0.1, False, False + ) + hidden_states_103 = None + add_124 = attn_outputs_20 + feed_forward_hidden_states_20 + attn_outputs_20 = feed_forward_hidden_states_20 = None + hidden_states_104 = add_124 + hidden_states_99 + add_124 = hidden_states_99 = None + hidden_states_105 = torch.nn.functional.layer_norm( + hidden_states_104, + (2560,), + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_ + ) = None + linear_126 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_64 = linear_126.view((1, 2, -1, 80)) + linear_126 = None + query_states_42 = view_64.transpose(1, 2) + view_64 = None + linear_127 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_65 = linear_127.view((1, 2, -1, 80)) + linear_127 = None + key_states_42 = view_65.transpose(1, 2) + view_65 = None + linear_128 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_66 = linear_128.view((1, 2, -1, 80)) + linear_128 = None + value_states_21 = view_66.transpose(1, 2) + view_66 = None + query_rot_21 = query_states_42[(Ellipsis, slice(None, 32, None))] + query_pass_21 = query_states_42[(Ellipsis, slice(32, None, None))] + query_states_42 = None + key_rot_21 = key_states_42[(Ellipsis, slice(None, 32, None))] + key_pass_21 = key_states_42[(Ellipsis, slice(32, None, None))] + key_states_42 = None + cos_24 = cos_2.unsqueeze(1) + sin_24 = sin_2.unsqueeze(1) + mul_171 = query_rot_21 * cos_24 + x1_42 = query_rot_21[(Ellipsis, 
slice(None, 16, None))] + x2_42 = query_rot_21[(Ellipsis, slice(16, None, None))] + query_rot_21 = None + neg_42 = -x2_42 + x2_42 = None + cat_85 = torch.cat((neg_42, x1_42), dim=-1) + neg_42 = x1_42 = None + mul_172 = cat_85 * sin_24 + cat_85 = None + q_embed_21 = mul_171 + mul_172 + mul_171 = mul_172 = None + mul_173 = key_rot_21 * cos_24 + cos_24 = None + x1_43 = key_rot_21[(Ellipsis, slice(None, 16, None))] + x2_43 = key_rot_21[(Ellipsis, slice(16, None, None))] + key_rot_21 = None + neg_43 = -x2_43 + x2_43 = None + cat_86 = torch.cat((neg_43, x1_43), dim=-1) + neg_43 = x1_43 = None + mul_174 = cat_86 * sin_24 + cat_86 = sin_24 = None + k_embed_21 = mul_173 + mul_174 + mul_173 = mul_174 = None + query_states_43 = torch.cat((q_embed_21, query_pass_21), dim=-1) + q_embed_21 = query_pass_21 = None + key_states_43 = torch.cat((k_embed_21, key_pass_21), dim=-1) + k_embed_21 = key_pass_21 = None + attention_mask_22 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_21 = query_states_43.contiguous() + query_states_43 = None + key_21 = key_states_43.contiguous() + value_21 = value_states_21.contiguous() + attn_output_84 = torch._C._nn.scaled_dot_product_attention( + query_21, + key_21, + value_21, + attn_mask=attention_mask_22, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_21 = key_21 = value_21 = attention_mask_22 = None + transpose_88 = attn_output_84.transpose(1, 2) + attn_output_84 = None + attn_output_85 = transpose_88.contiguous() + transpose_88 = None + reshape_21 = attn_output_85.reshape(1, 2, -1) + attn_output_85 = None + attn_output_86 = reshape_21.contiguous() + reshape_21 = None + attn_output_87 = torch._C._nn.linear( + attn_output_86, + l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_86 = l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_21 = torch.nn.functional.dropout(attn_output_87, 0.1, False, False) + attn_output_87 = None + hidden_states_106 = torch._C._nn.linear( + hidden_states_105, + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_105 = ( + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_175 = 0.5 * hidden_states_106 + pow_22 = torch.pow(hidden_states_106, 3.0) + mul_176 = 0.044715 * pow_22 + pow_22 = None + add_128 = hidden_states_106 + mul_176 + hidden_states_106 = mul_176 = None + mul_177 = 0.7978845608028654 * add_128 + add_128 = None + tanh_21 = torch.tanh(mul_177) + mul_177 = None + add_129 = 1.0 + tanh_21 + tanh_21 = None + hidden_states_107 = mul_175 * add_129 + mul_175 = add_129 = None + hidden_states_108 = torch._C._nn.linear( + hidden_states_107, + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_107 = ( + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_ + ) = None + 
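# [Editor's note — annotation, not part of the generated graph.]
# The pervasive `name = None` (and chained `a = b = None`) assignments are
# emitted by the graph capture to drop Python references as soon as a value
# is dead, so intermediate activations and parameter handles can be freed
# eagerly rather than living until the end of this very long function.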
feed_forward_hidden_states_21 = torch.nn.functional.dropout( + hidden_states_108, 0.1, False, False + ) + hidden_states_108 = None + add_130 = attn_outputs_21 + feed_forward_hidden_states_21 + attn_outputs_21 = feed_forward_hidden_states_21 = None + hidden_states_109 = add_130 + hidden_states_104 + add_130 = hidden_states_104 = None + hidden_states_110 = torch.nn.functional.layer_norm( + hidden_states_109, + (2560,), + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_ + ) = None + linear_132 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_67 = linear_132.view((1, 2, -1, 80)) + linear_132 = None + query_states_44 = view_67.transpose(1, 2) + view_67 = None + linear_133 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_68 = linear_133.view((1, 2, -1, 80)) + linear_133 = None + key_states_44 = view_68.transpose(1, 2) + view_68 = None + linear_134 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_69 = linear_134.view((1, 2, -1, 80)) + linear_134 = None + value_states_22 = view_69.transpose(1, 2) + view_69 = None + query_rot_22 = query_states_44[(Ellipsis, slice(None, 32, None))] + query_pass_22 = query_states_44[(Ellipsis, slice(32, None, None))] + query_states_44 = None + key_rot_22 = key_states_44[(Ellipsis, slice(None, 32, None))] + key_pass_22 = key_states_44[(Ellipsis, slice(32, None, None))] + key_states_44 = None + cos_25 = cos_2.unsqueeze(1) + sin_25 = sin_2.unsqueeze(1) + mul_179 = query_rot_22 * cos_25 + x1_44 = query_rot_22[(Ellipsis, slice(None, 16, None))] + x2_44 = query_rot_22[(Ellipsis, slice(16, None, None))] + query_rot_22 = None + neg_44 = -x2_44 + x2_44 = None + cat_89 = torch.cat((neg_44, x1_44), dim=-1) + neg_44 = x1_44 = None + mul_180 = cat_89 * sin_25 + cat_89 = None + q_embed_22 = mul_179 + mul_180 + mul_179 = mul_180 = None + mul_181 = key_rot_22 * cos_25 + cos_25 = None + x1_45 = key_rot_22[(Ellipsis, slice(None, 16, None))] + x2_45 = key_rot_22[(Ellipsis, slice(16, None, None))] + key_rot_22 = None + neg_45 = -x2_45 + x2_45 = None + cat_90 = torch.cat((neg_45, x1_45), dim=-1) + neg_45 = x1_45 = None + mul_182 = cat_90 * sin_25 + cat_90 = sin_25 = None + k_embed_22 = mul_181 + mul_182 + mul_181 = mul_182 = None + query_states_45 = 
torch.cat((q_embed_22, query_pass_22), dim=-1) + q_embed_22 = query_pass_22 = None + key_states_45 = torch.cat((k_embed_22, key_pass_22), dim=-1) + k_embed_22 = key_pass_22 = None + attention_mask_23 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_22 = query_states_45.contiguous() + query_states_45 = None + key_22 = key_states_45.contiguous() + value_22 = value_states_22.contiguous() + attn_output_88 = torch._C._nn.scaled_dot_product_attention( + query_22, + key_22, + value_22, + attn_mask=attention_mask_23, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_22 = key_22 = value_22 = attention_mask_23 = None + transpose_92 = attn_output_88.transpose(1, 2) + attn_output_88 = None + attn_output_89 = transpose_92.contiguous() + transpose_92 = None + reshape_22 = attn_output_89.reshape(1, 2, -1) + attn_output_89 = None + attn_output_90 = reshape_22.contiguous() + reshape_22 = None + attn_output_91 = torch._C._nn.linear( + attn_output_90, + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_90 = l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_22 = torch.nn.functional.dropout(attn_output_91, 0.1, False, False) + attn_output_91 = None + hidden_states_111 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_110 = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_183 = 0.5 * hidden_states_111 + pow_23 = torch.pow(hidden_states_111, 3.0) + mul_184 = 0.044715 * pow_23 + pow_23 = None + add_134 = hidden_states_111 + mul_184 + hidden_states_111 = mul_184 = None + mul_185 = 0.7978845608028654 * add_134 + add_134 = None + tanh_22 = torch.tanh(mul_185) + mul_185 = None + add_135 = 1.0 + tanh_22 + tanh_22 = None + hidden_states_112 = mul_183 * add_135 + mul_183 = add_135 = None + hidden_states_113 = torch._C._nn.linear( + hidden_states_112, + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_112 = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_22 = torch.nn.functional.dropout( + hidden_states_113, 0.1, False, False + ) + hidden_states_113 = None + add_136 = attn_outputs_22 + feed_forward_hidden_states_22 + attn_outputs_22 = feed_forward_hidden_states_22 = None + hidden_states_114 = add_136 + hidden_states_109 + add_136 = hidden_states_109 = None + hidden_states_115 = torch.nn.functional.layer_norm( + hidden_states_114, + (2560,), + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = ( + 
l_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_ + ) = None + linear_138 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_70 = linear_138.view((1, 2, -1, 80)) + linear_138 = None + query_states_46 = view_70.transpose(1, 2) + view_70 = None + linear_139 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_71 = linear_139.view((1, 2, -1, 80)) + linear_139 = None + key_states_46 = view_71.transpose(1, 2) + view_71 = None + linear_140 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_72 = linear_140.view((1, 2, -1, 80)) + linear_140 = None + value_states_23 = view_72.transpose(1, 2) + view_72 = None + query_rot_23 = query_states_46[(Ellipsis, slice(None, 32, None))] + query_pass_23 = query_states_46[(Ellipsis, slice(32, None, None))] + query_states_46 = None + key_rot_23 = key_states_46[(Ellipsis, slice(None, 32, None))] + key_pass_23 = key_states_46[(Ellipsis, slice(32, None, None))] + key_states_46 = None + cos_26 = cos_2.unsqueeze(1) + sin_26 = sin_2.unsqueeze(1) + mul_187 = query_rot_23 * cos_26 + x1_46 = query_rot_23[(Ellipsis, slice(None, 16, None))] + x2_46 = query_rot_23[(Ellipsis, slice(16, None, None))] + query_rot_23 = None + neg_46 = -x2_46 + x2_46 = None + cat_93 = torch.cat((neg_46, x1_46), dim=-1) + neg_46 = x1_46 = None + mul_188 = cat_93 * sin_26 + cat_93 = None + q_embed_23 = mul_187 + mul_188 + mul_187 = mul_188 = None + mul_189 = key_rot_23 * cos_26 + cos_26 = None + x1_47 = key_rot_23[(Ellipsis, slice(None, 16, None))] + x2_47 = key_rot_23[(Ellipsis, slice(16, None, None))] + key_rot_23 = None + neg_47 = -x2_47 + x2_47 = None + cat_94 = torch.cat((neg_47, x1_47), dim=-1) + neg_47 = x1_47 = None + mul_190 = cat_94 * sin_26 + cat_94 = sin_26 = None + k_embed_23 = mul_189 + mul_190 + mul_189 = mul_190 = None + query_states_47 = torch.cat((q_embed_23, query_pass_23), dim=-1) + q_embed_23 = query_pass_23 = None + key_states_47 = torch.cat((k_embed_23, key_pass_23), dim=-1) + k_embed_23 = key_pass_23 = None + attention_mask_24 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_23 = query_states_47.contiguous() + query_states_47 = None + key_23 = key_states_47.contiguous() + value_23 = value_states_23.contiguous() + attn_output_92 = torch._C._nn.scaled_dot_product_attention( + query_23, + key_23, + value_23, + attn_mask=attention_mask_24, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + 
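# [Editor's note — annotation, not part of the generated graph.]
# Every attention block lowers to scaled_dot_product_attention with
# scale=0.11180339887498948, i.e. 1/sqrt(80) for the 80-dim heads, an
# explicit additive mask, and is_causal=False because the causal structure
# is already baked into attn_mask. Public-API equivalent:
#
#   import torch.nn.functional as F
#   out = F.scaled_dot_product_attention(
#       query, key, value,
#       attn_mask=attention_mask,          # causal_mask sliced to kv length
#       dropout_p=0.0,
#       scale=1.0 / (80 ** 0.5),           # == 0.11180339887498948
#   )
#
# torch._C._nn.scaled_dot_product_attention is the internal binding that
# the public F.scaled_dot_product_attention exposes.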
query_23 = key_23 = value_23 = attention_mask_24 = None + transpose_96 = attn_output_92.transpose(1, 2) + attn_output_92 = None + attn_output_93 = transpose_96.contiguous() + transpose_96 = None + reshape_23 = attn_output_93.reshape(1, 2, -1) + attn_output_93 = None + attn_output_94 = reshape_23.contiguous() + reshape_23 = None + attn_output_95 = torch._C._nn.linear( + attn_output_94, + l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_94 = l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_23 = torch.nn.functional.dropout(attn_output_95, 0.1, False, False) + attn_output_95 = None + hidden_states_116 = torch._C._nn.linear( + hidden_states_115, + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_115 = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_191 = 0.5 * hidden_states_116 + pow_24 = torch.pow(hidden_states_116, 3.0) + mul_192 = 0.044715 * pow_24 + pow_24 = None + add_140 = hidden_states_116 + mul_192 + hidden_states_116 = mul_192 = None + mul_193 = 0.7978845608028654 * add_140 + add_140 = None + tanh_23 = torch.tanh(mul_193) + mul_193 = None + add_141 = 1.0 + tanh_23 + tanh_23 = None + hidden_states_117 = mul_191 * add_141 + mul_191 = add_141 = None + hidden_states_118 = torch._C._nn.linear( + hidden_states_117, + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_117 = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_23 = torch.nn.functional.dropout( + hidden_states_118, 0.1, False, False + ) + hidden_states_118 = None + add_142 = attn_outputs_23 + feed_forward_hidden_states_23 + attn_outputs_23 = feed_forward_hidden_states_23 = None + hidden_states_119 = add_142 + hidden_states_114 + add_142 = hidden_states_114 = None + hidden_states_120 = torch.nn.functional.layer_norm( + hidden_states_119, + (2560,), + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_bias_ + ) = None + linear_144 = torch._C._nn.linear( + hidden_states_120, + l_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_73 = linear_144.view((1, 2, -1, 80)) + linear_144 = None + query_states_48 = view_73.transpose(1, 2) + view_73 = None + linear_145 = torch._C._nn.linear( + hidden_states_120, + 
l_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_74 = linear_145.view((1, 2, -1, 80)) + linear_145 = None + key_states_48 = view_74.transpose(1, 2) + view_74 = None + linear_146 = torch._C._nn.linear( + hidden_states_120, + l_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_75 = linear_146.view((1, 2, -1, 80)) + linear_146 = None + value_states_24 = view_75.transpose(1, 2) + view_75 = None + query_rot_24 = query_states_48[(Ellipsis, slice(None, 32, None))] + query_pass_24 = query_states_48[(Ellipsis, slice(32, None, None))] + query_states_48 = None + key_rot_24 = key_states_48[(Ellipsis, slice(None, 32, None))] + key_pass_24 = key_states_48[(Ellipsis, slice(32, None, None))] + key_states_48 = None + cos_27 = cos_2.unsqueeze(1) + sin_27 = sin_2.unsqueeze(1) + mul_195 = query_rot_24 * cos_27 + x1_48 = query_rot_24[(Ellipsis, slice(None, 16, None))] + x2_48 = query_rot_24[(Ellipsis, slice(16, None, None))] + query_rot_24 = None + neg_48 = -x2_48 + x2_48 = None + cat_97 = torch.cat((neg_48, x1_48), dim=-1) + neg_48 = x1_48 = None + mul_196 = cat_97 * sin_27 + cat_97 = None + q_embed_24 = mul_195 + mul_196 + mul_195 = mul_196 = None + mul_197 = key_rot_24 * cos_27 + cos_27 = None + x1_49 = key_rot_24[(Ellipsis, slice(None, 16, None))] + x2_49 = key_rot_24[(Ellipsis, slice(16, None, None))] + key_rot_24 = None + neg_49 = -x2_49 + x2_49 = None + cat_98 = torch.cat((neg_49, x1_49), dim=-1) + neg_49 = x1_49 = None + mul_198 = cat_98 * sin_27 + cat_98 = sin_27 = None + k_embed_24 = mul_197 + mul_198 + mul_197 = mul_198 = None + query_states_49 = torch.cat((q_embed_24, query_pass_24), dim=-1) + q_embed_24 = query_pass_24 = None + key_states_49 = torch.cat((k_embed_24, key_pass_24), dim=-1) + k_embed_24 = key_pass_24 = None + attention_mask_25 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_24 = query_states_49.contiguous() + query_states_49 = None + key_24 = key_states_49.contiguous() + value_24 = value_states_24.contiguous() + attn_output_96 = torch._C._nn.scaled_dot_product_attention( + query_24, + key_24, + value_24, + attn_mask=attention_mask_25, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_24 = key_24 = value_24 = attention_mask_25 = None + transpose_100 = attn_output_96.transpose(1, 2) + attn_output_96 = None + attn_output_97 = transpose_100.contiguous() + transpose_100 = None + reshape_24 = attn_output_97.reshape(1, 2, -1) + attn_output_97 = None + attn_output_98 = reshape_24.contiguous() + reshape_24 = None + attn_output_99 = torch._C._nn.linear( + attn_output_98, + l_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_98 = l_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_weight_ = 
l_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_24 = torch.nn.functional.dropout(attn_output_99, 0.1, False, False) + attn_output_99 = None + hidden_states_121 = torch._C._nn.linear( + hidden_states_120, + l_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_120 = ( + l_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_199 = 0.5 * hidden_states_121 + pow_25 = torch.pow(hidden_states_121, 3.0) + mul_200 = 0.044715 * pow_25 + pow_25 = None + add_146 = hidden_states_121 + mul_200 + hidden_states_121 = mul_200 = None + mul_201 = 0.7978845608028654 * add_146 + add_146 = None + tanh_24 = torch.tanh(mul_201) + mul_201 = None + add_147 = 1.0 + tanh_24 + tanh_24 = None + hidden_states_122 = mul_199 * add_147 + mul_199 = add_147 = None + hidden_states_123 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_122 = ( + l_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_24 = torch.nn.functional.dropout( + hidden_states_123, 0.1, False, False + ) + hidden_states_123 = None + add_148 = attn_outputs_24 + feed_forward_hidden_states_24 + attn_outputs_24 = feed_forward_hidden_states_24 = None + hidden_states_124 = add_148 + hidden_states_119 + add_148 = hidden_states_119 = None + hidden_states_125 = torch.nn.functional.layer_norm( + hidden_states_124, + (2560,), + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_bias_ + ) = None + linear_150 = torch._C._nn.linear( + hidden_states_125, + l_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_76 = linear_150.view((1, 2, -1, 80)) + linear_150 = None + query_states_50 = view_76.transpose(1, 2) + view_76 = None + linear_151 = torch._C._nn.linear( + hidden_states_125, + l_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_77 = linear_151.view((1, 2, -1, 80)) + linear_151 = None + key_states_50 = view_77.transpose(1, 2) + view_77 = None + linear_152 = torch._C._nn.linear( + hidden_states_125, + l_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_bias_, + ) + 
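# [Editor's note — annotation, not part of the generated graph.]
# Each attention_mask_* below is causal_mask_2 narrowed on its last axis to
# the current key length (2 tokens in this trace). The four slice(None, ...)
# entries are just the graph-capture spelling of ordinary indexing:
#
#   attention_mask = causal_mask_2[:, :, :, :2]    # (..., q_len, kv_len)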
l_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_78 = linear_152.view((1, 2, -1, 80)) + linear_152 = None + value_states_25 = view_78.transpose(1, 2) + view_78 = None + query_rot_25 = query_states_50[(Ellipsis, slice(None, 32, None))] + query_pass_25 = query_states_50[(Ellipsis, slice(32, None, None))] + query_states_50 = None + key_rot_25 = key_states_50[(Ellipsis, slice(None, 32, None))] + key_pass_25 = key_states_50[(Ellipsis, slice(32, None, None))] + key_states_50 = None + cos_28 = cos_2.unsqueeze(1) + sin_28 = sin_2.unsqueeze(1) + mul_203 = query_rot_25 * cos_28 + x1_50 = query_rot_25[(Ellipsis, slice(None, 16, None))] + x2_50 = query_rot_25[(Ellipsis, slice(16, None, None))] + query_rot_25 = None + neg_50 = -x2_50 + x2_50 = None + cat_101 = torch.cat((neg_50, x1_50), dim=-1) + neg_50 = x1_50 = None + mul_204 = cat_101 * sin_28 + cat_101 = None + q_embed_25 = mul_203 + mul_204 + mul_203 = mul_204 = None + mul_205 = key_rot_25 * cos_28 + cos_28 = None + x1_51 = key_rot_25[(Ellipsis, slice(None, 16, None))] + x2_51 = key_rot_25[(Ellipsis, slice(16, None, None))] + key_rot_25 = None + neg_51 = -x2_51 + x2_51 = None + cat_102 = torch.cat((neg_51, x1_51), dim=-1) + neg_51 = x1_51 = None + mul_206 = cat_102 * sin_28 + cat_102 = sin_28 = None + k_embed_25 = mul_205 + mul_206 + mul_205 = mul_206 = None + query_states_51 = torch.cat((q_embed_25, query_pass_25), dim=-1) + q_embed_25 = query_pass_25 = None + key_states_51 = torch.cat((k_embed_25, key_pass_25), dim=-1) + k_embed_25 = key_pass_25 = None + attention_mask_26 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_25 = query_states_51.contiguous() + query_states_51 = None + key_25 = key_states_51.contiguous() + value_25 = value_states_25.contiguous() + attn_output_100 = torch._C._nn.scaled_dot_product_attention( + query_25, + key_25, + value_25, + attn_mask=attention_mask_26, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_25 = key_25 = value_25 = attention_mask_26 = None + transpose_104 = attn_output_100.transpose(1, 2) + attn_output_100 = None + attn_output_101 = transpose_104.contiguous() + transpose_104 = None + reshape_25 = attn_output_101.reshape(1, 2, -1) + attn_output_101 = None + attn_output_102 = reshape_25.contiguous() + reshape_25 = None + attn_output_103 = torch._C._nn.linear( + attn_output_102, + l_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_102 = l_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_25 = torch.nn.functional.dropout( + attn_output_103, 0.1, False, False + ) + attn_output_103 = None + hidden_states_126 = torch._C._nn.linear( + hidden_states_125, + l_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_125 = ( + l_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_207 = 0.5 * hidden_states_126 + pow_26 = torch.pow(hidden_states_126, 3.0) + 
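# [Editor's note — annotation, not part of the generated graph.]
# The dropout calls were traced in eval mode: in
# torch.nn.functional.dropout(x, 0.1, False, False) the positional args are
# (p=0.1, training=False, inplace=False), so every such call is an identity
# op here; p=0.1 would only take effect if the module ran in training mode.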
mul_208 = 0.044715 * pow_26 + pow_26 = None + add_152 = hidden_states_126 + mul_208 + hidden_states_126 = mul_208 = None + mul_209 = 0.7978845608028654 * add_152 + add_152 = None + tanh_25 = torch.tanh(mul_209) + mul_209 = None + add_153 = 1.0 + tanh_25 + tanh_25 = None + hidden_states_127 = mul_207 * add_153 + mul_207 = add_153 = None + hidden_states_128 = torch._C._nn.linear( + hidden_states_127, + l_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_127 = ( + l_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_25 = torch.nn.functional.dropout( + hidden_states_128, 0.1, False, False + ) + hidden_states_128 = None + add_154 = attn_outputs_25 + feed_forward_hidden_states_25 + attn_outputs_25 = feed_forward_hidden_states_25 = None + hidden_states_129 = add_154 + hidden_states_124 + add_154 = hidden_states_124 = None + hidden_states_130 = torch.nn.functional.layer_norm( + hidden_states_129, + (2560,), + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_bias_ + ) = None + linear_156 = torch._C._nn.linear( + hidden_states_130, + l_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_79 = linear_156.view((1, 2, -1, 80)) + linear_156 = None + query_states_52 = view_79.transpose(1, 2) + view_79 = None + linear_157 = torch._C._nn.linear( + hidden_states_130, + l_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_80 = linear_157.view((1, 2, -1, 80)) + linear_157 = None + key_states_52 = view_80.transpose(1, 2) + view_80 = None + linear_158 = torch._C._nn.linear( + hidden_states_130, + l_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_81 = linear_158.view((1, 2, -1, 80)) + linear_158 = None + value_states_26 = view_81.transpose(1, 2) + view_81 = None + query_rot_26 = query_states_52[(Ellipsis, slice(None, 32, None))] + query_pass_26 = query_states_52[(Ellipsis, slice(32, None, None))] + query_states_52 = None + key_rot_26 = key_states_52[(Ellipsis, slice(None, 32, None))] + key_pass_26 = key_states_52[(Ellipsis, slice(32, None, None))] + key_states_52 = None + cos_29 = cos_2.unsqueeze(1) + sin_29 = sin_2.unsqueeze(1) + mul_211 = query_rot_26 * cos_29 + 
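# NOTE (editorial sketch): each attention block above projects the layer-norm
# output with three biased linears, then reshapes into per-head layout via
# .view((batch, seq, -1, head_dim)).transpose(1, 2). With hidden size 2560 and
# head_dim 80, the inferred -1 dimension is 32 heads. Self-contained,
# illustrative version of that reshape:
import torch

batch, seq, hidden, head_dim = 1, 2, 2560, 80
proj = torch.nn.Linear(hidden, hidden)  # stands in for q_proj / k_proj / v_proj
hidden_states = torch.randn(batch, seq, hidden)

states = proj(hidden_states).view(batch, seq, -1, head_dim).transpose(1, 2)
assert states.shape == (batch, 32, seq, head_dim)  # [batch, heads, seq, head_dim]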
x1_52 = query_rot_26[(Ellipsis, slice(None, 16, None))] + x2_52 = query_rot_26[(Ellipsis, slice(16, None, None))] + query_rot_26 = None + neg_52 = -x2_52 + x2_52 = None + cat_105 = torch.cat((neg_52, x1_52), dim=-1) + neg_52 = x1_52 = None + mul_212 = cat_105 * sin_29 + cat_105 = None + q_embed_26 = mul_211 + mul_212 + mul_211 = mul_212 = None + mul_213 = key_rot_26 * cos_29 + cos_29 = None + x1_53 = key_rot_26[(Ellipsis, slice(None, 16, None))] + x2_53 = key_rot_26[(Ellipsis, slice(16, None, None))] + key_rot_26 = None + neg_53 = -x2_53 + x2_53 = None + cat_106 = torch.cat((neg_53, x1_53), dim=-1) + neg_53 = x1_53 = None + mul_214 = cat_106 * sin_29 + cat_106 = sin_29 = None + k_embed_26 = mul_213 + mul_214 + mul_213 = mul_214 = None + query_states_53 = torch.cat((q_embed_26, query_pass_26), dim=-1) + q_embed_26 = query_pass_26 = None + key_states_53 = torch.cat((k_embed_26, key_pass_26), dim=-1) + k_embed_26 = key_pass_26 = None + attention_mask_27 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_26 = query_states_53.contiguous() + query_states_53 = None + key_26 = key_states_53.contiguous() + value_26 = value_states_26.contiguous() + attn_output_104 = torch._C._nn.scaled_dot_product_attention( + query_26, + key_26, + value_26, + attn_mask=attention_mask_27, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_26 = key_26 = value_26 = attention_mask_27 = None + transpose_108 = attn_output_104.transpose(1, 2) + attn_output_104 = None + attn_output_105 = transpose_108.contiguous() + transpose_108 = None + reshape_26 = attn_output_105.reshape(1, 2, -1) + attn_output_105 = None + attn_output_106 = reshape_26.contiguous() + reshape_26 = None + attn_output_107 = torch._C._nn.linear( + attn_output_106, + l_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_106 = l_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_26 = torch.nn.functional.dropout( + attn_output_107, 0.1, False, False + ) + attn_output_107 = None + hidden_states_131 = torch._C._nn.linear( + hidden_states_130, + l_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_130 = ( + l_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_215 = 0.5 * hidden_states_131 + pow_27 = torch.pow(hidden_states_131, 3.0) + mul_216 = 0.044715 * pow_27 + pow_27 = None + add_158 = hidden_states_131 + mul_216 + hidden_states_131 = mul_216 = None + mul_217 = 0.7978845608028654 * add_158 + add_158 = None + tanh_26 = torch.tanh(mul_217) + mul_217 = None + add_159 = 1.0 + tanh_26 + tanh_26 = None + hidden_states_132 = mul_215 * add_159 + mul_215 = add_159 = None + hidden_states_133 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_132 = ( + l_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + 
l_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_26 = torch.nn.functional.dropout( + hidden_states_133, 0.1, False, False + ) + hidden_states_133 = None + add_160 = attn_outputs_26 + feed_forward_hidden_states_26 + attn_outputs_26 = feed_forward_hidden_states_26 = None + hidden_states_134 = add_160 + hidden_states_129 + add_160 = hidden_states_129 = None + hidden_states_135 = torch.nn.functional.layer_norm( + hidden_states_134, + (2560,), + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_bias_ + ) = None + linear_162 = torch._C._nn.linear( + hidden_states_135, + l_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_82 = linear_162.view((1, 2, -1, 80)) + linear_162 = None + query_states_54 = view_82.transpose(1, 2) + view_82 = None + linear_163 = torch._C._nn.linear( + hidden_states_135, + l_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_83 = linear_163.view((1, 2, -1, 80)) + linear_163 = None + key_states_54 = view_83.transpose(1, 2) + view_83 = None + linear_164 = torch._C._nn.linear( + hidden_states_135, + l_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_84 = linear_164.view((1, 2, -1, 80)) + linear_164 = None + value_states_27 = view_84.transpose(1, 2) + view_84 = None + query_rot_27 = query_states_54[(Ellipsis, slice(None, 32, None))] + query_pass_27 = query_states_54[(Ellipsis, slice(32, None, None))] + query_states_54 = None + key_rot_27 = key_states_54[(Ellipsis, slice(None, 32, None))] + key_pass_27 = key_states_54[(Ellipsis, slice(32, None, None))] + key_states_54 = None + cos_30 = cos_2.unsqueeze(1) + sin_30 = sin_2.unsqueeze(1) + mul_219 = query_rot_27 * cos_30 + x1_54 = query_rot_27[(Ellipsis, slice(None, 16, None))] + x2_54 = query_rot_27[(Ellipsis, slice(16, None, None))] + query_rot_27 = None + neg_54 = -x2_54 + x2_54 = None + cat_109 = torch.cat((neg_54, x1_54), dim=-1) + neg_54 = x1_54 = None + mul_220 = cat_109 * sin_30 + cat_109 = None + q_embed_27 = mul_219 + mul_220 + mul_219 = mul_220 = None + mul_221 = key_rot_27 * cos_30 + cos_30 = None + x1_55 = key_rot_27[(Ellipsis, slice(None, 16, None))] + x2_55 = key_rot_27[(Ellipsis, slice(16, None, None))] + key_rot_27 = None + neg_55 = -x2_55 + x2_55 = None + cat_110 = torch.cat((neg_55, x1_55), dim=-1) + neg_55 = x1_55 = None + mul_222 = cat_110 * sin_30 + cat_110 = sin_30 
= None + k_embed_27 = mul_221 + mul_222 + mul_221 = mul_222 = None + query_states_55 = torch.cat((q_embed_27, query_pass_27), dim=-1) + q_embed_27 = query_pass_27 = None + key_states_55 = torch.cat((k_embed_27, key_pass_27), dim=-1) + k_embed_27 = key_pass_27 = None + attention_mask_28 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_27 = query_states_55.contiguous() + query_states_55 = None + key_27 = key_states_55.contiguous() + value_27 = value_states_27.contiguous() + attn_output_108 = torch._C._nn.scaled_dot_product_attention( + query_27, + key_27, + value_27, + attn_mask=attention_mask_28, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_27 = key_27 = value_27 = attention_mask_28 = None + transpose_112 = attn_output_108.transpose(1, 2) + attn_output_108 = None + attn_output_109 = transpose_112.contiguous() + transpose_112 = None + reshape_27 = attn_output_109.reshape(1, 2, -1) + attn_output_109 = None + attn_output_110 = reshape_27.contiguous() + reshape_27 = None + attn_output_111 = torch._C._nn.linear( + attn_output_110, + l_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_110 = l_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_27 = torch.nn.functional.dropout( + attn_output_111, 0.1, False, False + ) + attn_output_111 = None + hidden_states_136 = torch._C._nn.linear( + hidden_states_135, + l_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_135 = ( + l_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_223 = 0.5 * hidden_states_136 + pow_28 = torch.pow(hidden_states_136, 3.0) + mul_224 = 0.044715 * pow_28 + pow_28 = None + add_164 = hidden_states_136 + mul_224 + hidden_states_136 = mul_224 = None + mul_225 = 0.7978845608028654 * add_164 + add_164 = None + tanh_27 = torch.tanh(mul_225) + mul_225 = None + add_165 = 1.0 + tanh_27 + tanh_27 = None + hidden_states_137 = mul_223 * add_165 + mul_223 = add_165 = None + hidden_states_138 = torch._C._nn.linear( + hidden_states_137, + l_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_137 = ( + l_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_27 = torch.nn.functional.dropout( + hidden_states_138, 0.1, False, False + ) + hidden_states_138 = None + add_166 = attn_outputs_27 + feed_forward_hidden_states_27 + attn_outputs_27 = feed_forward_hidden_states_27 = None + hidden_states_139 = add_166 + hidden_states_134 + add_166 = hidden_states_134 = None + hidden_states_140 = torch.nn.functional.layer_norm( + hidden_states_139, + (2560,), + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + 
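# NOTE (editorial sketch): unlike a pre-norm block with separate attention and
# MLP layer norms, the phi-2 layer traced above is *parallel*: one
# input_layernorm output feeds both self-attention and the MLP, and the two
# branch outputs are summed with the residual (add_166 + hidden_states_134).
# The dropout(x, 0.1, False, False) calls pass training=False, so they are
# identities in this inference capture. Schematic forward (module names and
# Identity stand-ins are illustrative):
import torch

class ParallelBlockSketch(torch.nn.Module):
    def __init__(self, hidden=2560):
        super().__init__()
        self.input_layernorm = torch.nn.LayerNorm(hidden, eps=1e-5)
        self.self_attn = torch.nn.Identity()  # stands in for attention + dense
        self.mlp = torch.nn.Identity()        # stands in for fc1 -> gelu -> fc2

    def forward(self, residual):
        hidden_states = self.input_layernorm(residual)
        attn_out = self.self_attn(hidden_states)  # attn_outputs_N in the trace
        mlp_out = self.mlp(hidden_states)         # feed_forward_hidden_states_N
        return attn_out + mlp_out + residual      # add_N + hidden_states_(N-1)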
l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_bias_ + ) = None + linear_168 = torch._C._nn.linear( + hidden_states_140, + l_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_85 = linear_168.view((1, 2, -1, 80)) + linear_168 = None + query_states_56 = view_85.transpose(1, 2) + view_85 = None + linear_169 = torch._C._nn.linear( + hidden_states_140, + l_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_86 = linear_169.view((1, 2, -1, 80)) + linear_169 = None + key_states_56 = view_86.transpose(1, 2) + view_86 = None + linear_170 = torch._C._nn.linear( + hidden_states_140, + l_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_87 = linear_170.view((1, 2, -1, 80)) + linear_170 = None + value_states_28 = view_87.transpose(1, 2) + view_87 = None + query_rot_28 = query_states_56[(Ellipsis, slice(None, 32, None))] + query_pass_28 = query_states_56[(Ellipsis, slice(32, None, None))] + query_states_56 = None + key_rot_28 = key_states_56[(Ellipsis, slice(None, 32, None))] + key_pass_28 = key_states_56[(Ellipsis, slice(32, None, None))] + key_states_56 = None + cos_31 = cos_2.unsqueeze(1) + sin_31 = sin_2.unsqueeze(1) + mul_227 = query_rot_28 * cos_31 + x1_56 = query_rot_28[(Ellipsis, slice(None, 16, None))] + x2_56 = query_rot_28[(Ellipsis, slice(16, None, None))] + query_rot_28 = None + neg_56 = -x2_56 + x2_56 = None + cat_113 = torch.cat((neg_56, x1_56), dim=-1) + neg_56 = x1_56 = None + mul_228 = cat_113 * sin_31 + cat_113 = None + q_embed_28 = mul_227 + mul_228 + mul_227 = mul_228 = None + mul_229 = key_rot_28 * cos_31 + cos_31 = None + x1_57 = key_rot_28[(Ellipsis, slice(None, 16, None))] + x2_57 = key_rot_28[(Ellipsis, slice(16, None, None))] + key_rot_28 = None + neg_57 = -x2_57 + x2_57 = None + cat_114 = torch.cat((neg_57, x1_57), dim=-1) + neg_57 = x1_57 = None + mul_230 = cat_114 * sin_31 + cat_114 = sin_31 = None + k_embed_28 = mul_229 + mul_230 + mul_229 = mul_230 = None + query_states_57 = torch.cat((q_embed_28, query_pass_28), dim=-1) + q_embed_28 = query_pass_28 = None + key_states_57 = torch.cat((k_embed_28, key_pass_28), dim=-1) + k_embed_28 = key_pass_28 = None + attention_mask_29 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_28 = query_states_57.contiguous() + query_states_57 = None + key_28 = key_states_57.contiguous() + value_28 = value_states_28.contiguous() + attn_output_112 = torch._C._nn.scaled_dot_product_attention( + query_28, + key_28, + value_28, + 
attn_mask=attention_mask_29, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_28 = key_28 = value_28 = attention_mask_29 = None + transpose_116 = attn_output_112.transpose(1, 2) + attn_output_112 = None + attn_output_113 = transpose_116.contiguous() + transpose_116 = None + reshape_28 = attn_output_113.reshape(1, 2, -1) + attn_output_113 = None + attn_output_114 = reshape_28.contiguous() + reshape_28 = None + attn_output_115 = torch._C._nn.linear( + attn_output_114, + l_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_114 = l_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_28 = torch.nn.functional.dropout( + attn_output_115, 0.1, False, False + ) + attn_output_115 = None + hidden_states_141 = torch._C._nn.linear( + hidden_states_140, + l_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_140 = ( + l_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_231 = 0.5 * hidden_states_141 + pow_29 = torch.pow(hidden_states_141, 3.0) + mul_232 = 0.044715 * pow_29 + pow_29 = None + add_170 = hidden_states_141 + mul_232 + hidden_states_141 = mul_232 = None + mul_233 = 0.7978845608028654 * add_170 + add_170 = None + tanh_28 = torch.tanh(mul_233) + mul_233 = None + add_171 = 1.0 + tanh_28 + tanh_28 = None + hidden_states_142 = mul_231 * add_171 + mul_231 = add_171 = None + hidden_states_143 = torch._C._nn.linear( + hidden_states_142, + l_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_142 = ( + l_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_28 = torch.nn.functional.dropout( + hidden_states_143, 0.1, False, False + ) + hidden_states_143 = None + add_172 = attn_outputs_28 + feed_forward_hidden_states_28 + attn_outputs_28 = feed_forward_hidden_states_28 = None + hidden_states_144 = add_172 + hidden_states_139 + add_172 = hidden_states_139 = None + hidden_states_145 = torch.nn.functional.layer_norm( + hidden_states_144, + (2560,), + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_bias_ + ) = None + linear_174 = torch._C._nn.linear( + hidden_states_145, + l_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_88 = linear_174.view((1, 2, -1, 80)) + linear_174 = None + query_states_58 = view_88.transpose(1, 2) + view_88 = None + 
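# NOTE (editorial sketch): the traced graph calls scaled_dot_product_attention
# with an explicit additive mask and scale=0.11180339887498948, which is just
# head_dim ** -0.5 for head_dim = 80; is_causal=False because causality is
# already baked into the mask. Equivalent public-API call, followed by the
# same head-merge the trace performs before the dense projection:
import torch
import torch.nn.functional as F

batch, heads, seq, head_dim = 1, 32, 2, 80
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
v = torch.randn(batch, heads, seq, head_dim)
mask = torch.zeros(1, 1, seq, seq)  # stand-in for the sliced causal mask

out = F.scaled_dot_product_attention(
    q, k, v, attn_mask=mask, dropout_p=0.0, scale=head_dim ** -0.5, is_causal=False
)
# Merge heads back: transpose_N -> contiguous -> reshape(batch, seq, -1).
out = out.transpose(1, 2).contiguous().reshape(batch, seq, -1)
assert out.shape == (batch, seq, heads * head_dim)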
linear_175 = torch._C._nn.linear( + hidden_states_145, + l_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_89 = linear_175.view((1, 2, -1, 80)) + linear_175 = None + key_states_58 = view_89.transpose(1, 2) + view_89 = None + linear_176 = torch._C._nn.linear( + hidden_states_145, + l_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_90 = linear_176.view((1, 2, -1, 80)) + linear_176 = None + value_states_29 = view_90.transpose(1, 2) + view_90 = None + query_rot_29 = query_states_58[(Ellipsis, slice(None, 32, None))] + query_pass_29 = query_states_58[(Ellipsis, slice(32, None, None))] + query_states_58 = None + key_rot_29 = key_states_58[(Ellipsis, slice(None, 32, None))] + key_pass_29 = key_states_58[(Ellipsis, slice(32, None, None))] + key_states_58 = None + cos_32 = cos_2.unsqueeze(1) + sin_32 = sin_2.unsqueeze(1) + mul_235 = query_rot_29 * cos_32 + x1_58 = query_rot_29[(Ellipsis, slice(None, 16, None))] + x2_58 = query_rot_29[(Ellipsis, slice(16, None, None))] + query_rot_29 = None + neg_58 = -x2_58 + x2_58 = None + cat_117 = torch.cat((neg_58, x1_58), dim=-1) + neg_58 = x1_58 = None + mul_236 = cat_117 * sin_32 + cat_117 = None + q_embed_29 = mul_235 + mul_236 + mul_235 = mul_236 = None + mul_237 = key_rot_29 * cos_32 + cos_32 = None + x1_59 = key_rot_29[(Ellipsis, slice(None, 16, None))] + x2_59 = key_rot_29[(Ellipsis, slice(16, None, None))] + key_rot_29 = None + neg_59 = -x2_59 + x2_59 = None + cat_118 = torch.cat((neg_59, x1_59), dim=-1) + neg_59 = x1_59 = None + mul_238 = cat_118 * sin_32 + cat_118 = sin_32 = None + k_embed_29 = mul_237 + mul_238 + mul_237 = mul_238 = None + query_states_59 = torch.cat((q_embed_29, query_pass_29), dim=-1) + q_embed_29 = query_pass_29 = None + key_states_59 = torch.cat((k_embed_29, key_pass_29), dim=-1) + k_embed_29 = key_pass_29 = None + attention_mask_30 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_29 = query_states_59.contiguous() + query_states_59 = None + key_29 = key_states_59.contiguous() + value_29 = value_states_29.contiguous() + attn_output_116 = torch._C._nn.scaled_dot_product_attention( + query_29, + key_29, + value_29, + attn_mask=attention_mask_30, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_29 = key_29 = value_29 = attention_mask_30 = None + transpose_120 = attn_output_116.transpose(1, 2) + attn_output_116 = None + attn_output_117 = transpose_120.contiguous() + transpose_120 = None + reshape_29 = attn_output_117.reshape(1, 2, -1) + attn_output_117 = None + attn_output_118 = reshape_29.contiguous() + reshape_29 = None + attn_output_119 = torch._C._nn.linear( + attn_output_118, + l_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_118 = 
l_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_29 = torch.nn.functional.dropout( + attn_output_119, 0.1, False, False + ) + attn_output_119 = None + hidden_states_146 = torch._C._nn.linear( + hidden_states_145, + l_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_145 = ( + l_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_239 = 0.5 * hidden_states_146 + pow_30 = torch.pow(hidden_states_146, 3.0) + mul_240 = 0.044715 * pow_30 + pow_30 = None + add_176 = hidden_states_146 + mul_240 + hidden_states_146 = mul_240 = None + mul_241 = 0.7978845608028654 * add_176 + add_176 = None + tanh_29 = torch.tanh(mul_241) + mul_241 = None + add_177 = 1.0 + tanh_29 + tanh_29 = None + hidden_states_147 = mul_239 * add_177 + mul_239 = add_177 = None + hidden_states_148 = torch._C._nn.linear( + hidden_states_147, + l_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_147 = ( + l_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_29 = torch.nn.functional.dropout( + hidden_states_148, 0.1, False, False + ) + hidden_states_148 = None + add_178 = attn_outputs_29 + feed_forward_hidden_states_29 + attn_outputs_29 = feed_forward_hidden_states_29 = None + hidden_states_149 = add_178 + hidden_states_144 + add_178 = hidden_states_144 = None + hidden_states_150 = torch.nn.functional.layer_norm( + hidden_states_149, + (2560,), + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_bias_ + ) = None + linear_180 = torch._C._nn.linear( + hidden_states_150, + l_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_91 = linear_180.view((1, 2, -1, 80)) + linear_180 = None + query_states_60 = view_91.transpose(1, 2) + view_91 = None + linear_181 = torch._C._nn.linear( + hidden_states_150, + l_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_92 = linear_181.view((1, 2, -1, 80)) + linear_181 = None + key_states_60 = view_92.transpose(1, 2) + view_92 = None + linear_182 = torch._C._nn.linear( + hidden_states_150, + l_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_, + 
l_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_93 = linear_182.view((1, 2, -1, 80)) + linear_182 = None + value_states_30 = view_93.transpose(1, 2) + view_93 = None + query_rot_30 = query_states_60[(Ellipsis, slice(None, 32, None))] + query_pass_30 = query_states_60[(Ellipsis, slice(32, None, None))] + query_states_60 = None + key_rot_30 = key_states_60[(Ellipsis, slice(None, 32, None))] + key_pass_30 = key_states_60[(Ellipsis, slice(32, None, None))] + key_states_60 = None + cos_33 = cos_2.unsqueeze(1) + sin_33 = sin_2.unsqueeze(1) + mul_243 = query_rot_30 * cos_33 + x1_60 = query_rot_30[(Ellipsis, slice(None, 16, None))] + x2_60 = query_rot_30[(Ellipsis, slice(16, None, None))] + query_rot_30 = None + neg_60 = -x2_60 + x2_60 = None + cat_121 = torch.cat((neg_60, x1_60), dim=-1) + neg_60 = x1_60 = None + mul_244 = cat_121 * sin_33 + cat_121 = None + q_embed_30 = mul_243 + mul_244 + mul_243 = mul_244 = None + mul_245 = key_rot_30 * cos_33 + cos_33 = None + x1_61 = key_rot_30[(Ellipsis, slice(None, 16, None))] + x2_61 = key_rot_30[(Ellipsis, slice(16, None, None))] + key_rot_30 = None + neg_61 = -x2_61 + x2_61 = None + cat_122 = torch.cat((neg_61, x1_61), dim=-1) + neg_61 = x1_61 = None + mul_246 = cat_122 * sin_33 + cat_122 = sin_33 = None + k_embed_30 = mul_245 + mul_246 + mul_245 = mul_246 = None + query_states_61 = torch.cat((q_embed_30, query_pass_30), dim=-1) + q_embed_30 = query_pass_30 = None + key_states_61 = torch.cat((k_embed_30, key_pass_30), dim=-1) + k_embed_30 = key_pass_30 = None + attention_mask_31 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_30 = query_states_61.contiguous() + query_states_61 = None + key_30 = key_states_61.contiguous() + value_30 = value_states_30.contiguous() + attn_output_120 = torch._C._nn.scaled_dot_product_attention( + query_30, + key_30, + value_30, + attn_mask=attention_mask_31, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_30 = key_30 = value_30 = attention_mask_31 = None + transpose_124 = attn_output_120.transpose(1, 2) + attn_output_120 = None + attn_output_121 = transpose_124.contiguous() + transpose_124 = None + reshape_30 = attn_output_121.reshape(1, 2, -1) + attn_output_121 = None + attn_output_122 = reshape_30.contiguous() + reshape_30 = None + attn_output_123 = torch._C._nn.linear( + attn_output_122, + l_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_122 = l_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_30 = torch.nn.functional.dropout( + attn_output_123, 0.1, False, False + ) + attn_output_123 = None + hidden_states_151 = torch._C._nn.linear( + hidden_states_150, + l_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_150 = ( + l_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_bias_ + ) = 
None + mul_247 = 0.5 * hidden_states_151 + pow_31 = torch.pow(hidden_states_151, 3.0) + mul_248 = 0.044715 * pow_31 + pow_31 = None + add_182 = hidden_states_151 + mul_248 + hidden_states_151 = mul_248 = None + mul_249 = 0.7978845608028654 * add_182 + add_182 = None + tanh_30 = torch.tanh(mul_249) + mul_249 = None + add_183 = 1.0 + tanh_30 + tanh_30 = None + hidden_states_152 = mul_247 * add_183 + mul_247 = add_183 = None + hidden_states_153 = torch._C._nn.linear( + hidden_states_152, + l_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_bias_, + ) + hidden_states_152 = ( + l_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_30 = torch.nn.functional.dropout( + hidden_states_153, 0.1, False, False + ) + hidden_states_153 = None + add_184 = attn_outputs_30 + feed_forward_hidden_states_30 + attn_outputs_30 = feed_forward_hidden_states_30 = None + hidden_states_154 = add_184 + hidden_states_149 + add_184 = hidden_states_149 = None + hidden_states_155 = torch.nn.functional.layer_norm( + hidden_states_154, + (2560,), + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_bias_ + ) = None + linear_186 = torch._C._nn.linear( + hidden_states_155, + l_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_94 = linear_186.view((1, 2, -1, 80)) + linear_186 = None + query_states_62 = view_94.transpose(1, 2) + view_94 = None + linear_187 = torch._C._nn.linear( + hidden_states_155, + l_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_95 = linear_187.view((1, 2, -1, 80)) + linear_187 = None + key_states_62 = view_95.transpose(1, 2) + view_95 = None + linear_188 = torch._C._nn.linear( + hidden_states_155, + l_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_bias_, + ) + l_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_96 = linear_188.view((1, 2, -1, 80)) + linear_188 = None + value_states_31 = view_96.transpose(1, 2) + view_96 = None + query_rot_31 = query_states_62[(Ellipsis, slice(None, 32, None))] + query_pass_31 = query_states_62[(Ellipsis, slice(32, None, None))] + query_states_62 = None + key_rot_31 = key_states_62[(Ellipsis, slice(None, 32, None))] + key_pass_31 = key_states_62[(Ellipsis, slice(32, None, None))] + key_states_62 = None + cos_34 = 
cos_2.unsqueeze(1) + cos_2 = None + sin_34 = sin_2.unsqueeze(1) + sin_2 = None + mul_251 = query_rot_31 * cos_34 + x1_62 = query_rot_31[(Ellipsis, slice(None, 16, None))] + x2_62 = query_rot_31[(Ellipsis, slice(16, None, None))] + query_rot_31 = None + neg_62 = -x2_62 + x2_62 = None + cat_125 = torch.cat((neg_62, x1_62), dim=-1) + neg_62 = x1_62 = None + mul_252 = cat_125 * sin_34 + cat_125 = None + q_embed_31 = mul_251 + mul_252 + mul_251 = mul_252 = None + mul_253 = key_rot_31 * cos_34 + cos_34 = None + x1_63 = key_rot_31[(Ellipsis, slice(None, 16, None))] + x2_63 = key_rot_31[(Ellipsis, slice(16, None, None))] + key_rot_31 = None + neg_63 = -x2_63 + x2_63 = None + cat_126 = torch.cat((neg_63, x1_63), dim=-1) + neg_63 = x1_63 = None + mul_254 = cat_126 * sin_34 + cat_126 = sin_34 = None + k_embed_31 = mul_253 + mul_254 + mul_253 = mul_254 = None + query_states_63 = torch.cat((q_embed_31, query_pass_31), dim=-1) + q_embed_31 = query_pass_31 = None + key_states_63 = torch.cat((k_embed_31, key_pass_31), dim=-1) + k_embed_31 = key_pass_31 = None + attention_mask_32 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + causal_mask_2 = None + query_31 = query_states_63.contiguous() + query_states_63 = None + key_31 = key_states_63.contiguous() + value_31 = value_states_31.contiguous() + attn_output_124 = torch._C._nn.scaled_dot_product_attention( + query_31, + key_31, + value_31, + attn_mask=attention_mask_32, + dropout_p=0.0, + scale=0.11180339887498948, + is_causal=False, + ) + query_31 = key_31 = value_31 = attention_mask_32 = None + transpose_128 = attn_output_124.transpose(1, 2) + attn_output_124 = None + attn_output_125 = transpose_128.contiguous() + transpose_128 = None + reshape_31 = attn_output_125.reshape(1, 2, -1) + attn_output_125 = None + attn_output_126 = reshape_31.contiguous() + reshape_31 = None + attn_output_127 = torch._C._nn.linear( + attn_output_126, + l_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_weight_, + l_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_bias_, + ) + attn_output_126 = l_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_weight_ = l_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_bias_ = (None) + attn_outputs_31 = torch.nn.functional.dropout( + attn_output_127, 0.1, False, False + ) + attn_output_127 = None + hidden_states_156 = torch._C._nn.linear( + hidden_states_155, + l_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_weight_, + l_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_bias_, + ) + hidden_states_155 = ( + l_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_weight_ + ) = ( + l_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_bias_ + ) = None + mul_255 = 0.5 * hidden_states_156 + pow_32 = torch.pow(hidden_states_156, 3.0) + mul_256 = 0.044715 * pow_32 + pow_32 = None + add_188 = hidden_states_156 + mul_256 + hidden_states_156 = mul_256 = None + mul_257 = 0.7978845608028654 * add_188 + add_188 = None + tanh_31 = torch.tanh(mul_257) + mul_257 = None + add_189 = 1.0 + tanh_31 + tanh_31 = None + hidden_states_157 = mul_255 * add_189 + mul_255 = add_189 = None + hidden_states_158 = torch._C._nn.linear( + hidden_states_157, + l_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_weight_, + l_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_bias_, + ) + 
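# NOTE (editorial sketch): every layer slices the shared 4-D causal mask to
# the current key length before SDPA -- causal_mask_2[:, :, :, :kv_len], with
# kv_len = 2 for this two-token capture -- and the final layer then releases
# it (causal_mask_2 = None). An illustrative construction of such an additive
# mask (large negative above the diagonal, 0 elsewhere); the exact values in
# the captured graph come from the model's own mask-building code:
import torch

seq_len, kv_len = 2, 2
min_val = torch.finfo(torch.float16).min
causal_mask = torch.full((1, 1, seq_len, kv_len), min_val, dtype=torch.float16)
causal_mask = torch.triu(causal_mask, diagonal=1)  # 0 on/below diagonal
attention_mask = causal_mask[:, :, :, :kv_len]     # matches the traced slice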
hidden_states_157 = ( + l_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_weight_ + ) = ( + l_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_bias_ + ) = None + feed_forward_hidden_states_31 = torch.nn.functional.dropout( + hidden_states_158, 0.1, False, False + ) + hidden_states_158 = None + add_190 = attn_outputs_31 + feed_forward_hidden_states_31 + attn_outputs_31 = feed_forward_hidden_states_31 = None + hidden_states_159 = add_190 + hidden_states_154 + add_190 = hidden_states_154 = None + hidden_states_160 = torch.nn.functional.layer_norm( + hidden_states_159, + (2560,), + l_self_modules_final_layernorm_parameters_weight_, + l_self_modules_final_layernorm_parameters_bias_, + 1e-05, + ) + hidden_states_159 = ( + l_self_modules_final_layernorm_parameters_weight_ + ) = l_self_modules_final_layernorm_parameters_bias_ = None + return ( + value_states, + key_states_1, + value_states_1, + key_states_3, + value_states_2, + key_states_5, + value_states_3, + key_states_7, + value_states_4, + key_states_9, + value_states_5, + key_states_11, + value_states_6, + key_states_13, + value_states_7, + key_states_15, + value_states_8, + key_states_17, + value_states_9, + key_states_19, + value_states_10, + key_states_21, + value_states_11, + key_states_23, + value_states_12, + key_states_25, + value_states_13, + key_states_27, + value_states_14, + key_states_29, + value_states_15, + key_states_31, + value_states_16, + key_states_33, + value_states_17, + key_states_35, + value_states_18, + key_states_37, + value_states_19, + key_states_39, + value_states_20, + key_states_41, + value_states_21, + key_states_43, + value_states_22, + key_states_45, + value_states_23, + key_states_47, + value_states_24, + key_states_49, + value_states_25, + key_states_51, + value_states_26, + key_states_53, + value_states_27, + key_states_55, + value_states_28, + key_states_57, + value_states_29, + key_states_59, + value_states_30, + key_states_61, + value_states_31, + key_states_63, + hidden_states_160, + ) diff --git a/samples/transformers-auto-model/microsoft/phi-2/weight_meta.py b/samples/transformers-auto-model/microsoft/phi-2/weight_meta.py new file mode 100644 index 000000000..d4cf86b4b --- /dev/null +++ b/samples/transformers-auto-model/microsoft/phi-2/weight_meta.py @@ -0,0 +1,4545 @@ +class Program_weight_tensor_meta_L_inputs_embeds_: + name = "L_inputs_embeds_" + shape = [1, 2, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1] + + +class Program_weight_tensor_meta_L_self_modules_rotary_emb_buffers_inv_freq_: + name = "L_self_modules_rotary_emb_buffers_inv_freq_" + shape = [16] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.143 + std = 0.275 + data = [ + 1.000000, + 0.562341, + 0.316228, + 0.177828, + 0.100000, + 0.056234, + 0.031623, + 0.017783, + 0.010000, + 0.005623, + 0.003162, + 0.001778, + 0.001000, + 0.000562, + 0.000316, + 0.000178, + ] + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_15_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_15_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_16_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_16_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_16_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_17_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_17_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_18_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_19_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_20_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_21_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_22_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_23_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_24_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_24_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_24_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_24_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_25_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_25_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_25_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_25_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_26_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_26_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_26_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_26_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_27_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_27_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_27_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_27_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_28_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_28_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_28_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_28_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_29_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_29_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_weight_:
+    name = "L_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_weight_"
+    shape = [10240, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_bias_:
+    name = "L_self_modules_layers_modules_29_modules_mlp_modules_fc1_parameters_bias_"
+    shape = [10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_weight_:
+    name = "L_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_weight_"
+    shape = [2560, 10240]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_bias_:
+    name = "L_self_modules_layers_modules_29_modules_mlp_modules_fc2_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_input_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_30_modules_input_layernorm_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_bias_:
+    name = "L_self_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_weight_"
+    shape = [2560, 2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_30_modules_self_attn_modules_dense_parameters_bias_"
+    shape = [2560]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_30_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_31_modules_input_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_weight_" + shape = [2560, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_31_modules_self_attn_modules_dense_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_weight_" + shape = [10240, 2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_bias_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_fc1_parameters_bias_" + shape = [10240] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_weight_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_weight_" + shape = [2560, 10240] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_bias_: + name = "L_self_modules_layers_modules_31_modules_mlp_modules_fc2_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_final_layernorm_parameters_weight_: + name = "L_self_modules_final_layernorm_parameters_weight_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_final_layernorm_parameters_bias_: + name = "L_self_modules_final_layernorm_parameters_bias_" + shape = [2560] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/openai-community/gpt2/graph_hash.txt b/samples/transformers-auto-model/openai-community/gpt2/graph_hash.txt new file mode 100644 index 000000000..d7beb5fe9 --- /dev/null +++ b/samples/transformers-auto-model/openai-community/gpt2/graph_hash.txt @@ -0,0 +1 @@ +364cd95b661841fd355c33810d1cd27ffa64dbbbfd8ab9338b8fbfabe0f65683 \ No newline at end of file diff --git a/samples/transformers-auto-model/openai-community/gpt2/graph_net.json b/samples/transformers-auto-model/openai-community/gpt2/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/openai-community/gpt2/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff 
--git a/samples/transformers-auto-model/openai-community/gpt2/input_meta.py b/samples/transformers-auto-model/openai-community/gpt2/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/openai-community/gpt2/input_tensor_constraints.py b/samples/transformers-auto-model/openai-community/gpt2/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/openai-community/gpt2/model.py b/samples/transformers-auto-model/openai-community/gpt2/model.py new file mode 100644 index 000000000..d3ac0ba84 --- /dev/null +++ b/samples/transformers-auto-model/openai-community/gpt2/model.py @@ -0,0 +1,2449 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_self_modules_wte_parameters_weight_: torch.nn.parameter.Parameter, + L_input_ids_: torch.Tensor, + L_self_modules_wpe_parameters_weight_: torch.nn.parameter.Parameter, + L_attention_mask_: torch.Tensor, + L_self_modules_h_modules_0_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + 
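# annotation: torch.compile (Dynamo) lifts every module parameter into a flat +
# positional argument whose name encodes the original attribute path, so +
# `L_self_modules_h_modules_2_modules_ln_1_parameters_weight_` is the traced +
# handle for h[2].ln_1.weight on the captured GPT2Model; the same twelve +
# arguments repeat below for each of GPT-2's 12 blocks. +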
L_self_modules_h_modules_2_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_5_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_ln_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_ln_1_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_ln_2_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_ln_2_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_ln_f_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_ln_f_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_self_modules_wte_parameters_weight_ = L_self_modules_wte_parameters_weight_ + l_input_ids_ = L_input_ids_ + l_self_modules_wpe_parameters_weight_ = L_self_modules_wpe_parameters_weight_ + l_attention_mask_ = L_attention_mask_ + l_self_modules_h_modules_0_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_0_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_0_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_0_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_0_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_0_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_0_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_0_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_weight_ = ( + 
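# annotation: the assignments in this prologue only unalias the lifted +
# arguments to local names; no computation happens until the embedding +
# lookup further below. +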
L_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_1_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_1_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_1_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_1_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_1_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_1_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_1_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_1_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_2_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_2_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_2_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_2_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_2_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_2_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_2_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_2_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_weight_ = ( + 
L_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_3_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_3_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_3_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_3_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_3_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_3_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_3_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_3_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_4_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_4_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_4_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_4_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_4_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_4_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_4_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_4_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_weight_ = ( + 
L_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_5_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_5_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_5_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_5_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_5_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_5_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_5_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_5_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_6_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_6_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_6_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_6_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_6_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_6_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_6_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_6_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_weight_ = ( + 
L_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_7_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_7_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_7_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_7_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_7_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_7_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_7_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_7_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_8_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_8_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_8_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_8_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_8_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_8_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_8_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_8_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_weight_ = ( + 
L_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_9_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_9_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_9_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_9_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_9_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_9_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_9_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_9_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_10_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_10_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_10_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_10_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_10_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_10_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_10_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_10_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_weight_ = ( + 
L_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_11_modules_ln_1_parameters_weight_ = ( + L_self_modules_h_modules_11_modules_ln_1_parameters_weight_ + ) + l_self_modules_h_modules_11_modules_ln_1_parameters_bias_ = ( + L_self_modules_h_modules_11_modules_ln_1_parameters_bias_ + ) + l_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_bias_ = ( + L_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_bias_ + ) + l_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_weight_ = ( + L_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_weight_ + ) + l_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_weight_ + ) + l_self_modules_h_modules_11_modules_ln_2_parameters_weight_ = ( + L_self_modules_h_modules_11_modules_ln_2_parameters_weight_ + ) + l_self_modules_h_modules_11_modules_ln_2_parameters_bias_ = ( + L_self_modules_h_modules_11_modules_ln_2_parameters_bias_ + ) + l_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_bias_ = ( + L_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_bias_ + ) + l_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_weight_ = ( + L_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_weight_ + ) + l_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_bias_ = ( + L_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_bias_ + ) + l_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_weight_ = ( + L_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_weight_ + ) + l_self_modules_ln_f_parameters_weight_ = L_self_modules_ln_f_parameters_weight_ + l_self_modules_ln_f_parameters_bias_ = L_self_modules_ln_f_parameters_bias_ + inputs_embeds = torch.nn.functional.embedding( + l_input_ids_, + l_self_modules_wte_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + l_input_ids_ = l_self_modules_wte_parameters_weight_ = None + cache_position = torch.arange(0, 2, device=device(type="cuda", index=0)) + position_ids = cache_position.unsqueeze(0) + position_embeds = torch.nn.functional.embedding( + position_ids, + l_self_modules_wpe_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + position_ids = l_self_modules_wpe_parameters_weight_ = None + to = position_embeds.to(device(type="cuda", index=0)) + position_embeds = None + hidden_states = inputs_embeds + to + inputs_embeds = to = None + attention_mask = l_attention_mask_.view(1, -1) + l_attention_mask_ = None + attention_mask_1 = attention_mask.to( + device=device(type="cuda", index=0), dtype=torch.bool + ) + attention_mask = None + mask_indices = torch.arange(2, device=device(type="cuda", index=0)) + mask_indices += 0 + mask_indices_1 = mask_indices + mask_indices = None + local_padding_mask = attention_mask_1[(slice(None, None, None), mask_indices_1)] + attention_mask_1 = mask_indices_1 = None + kv_arange = torch.arange(2, device=device(type="cuda", index=0)) + kv_arange += 0 + kv_arange_1 = kv_arange + kv_arange = None + reshaped_cache_position = cache_position.view(-1, 1) + cache_position = None + causal_mask = kv_arange_1 <= reshaped_cache_position + kv_arange_1 = reshaped_cache_position = None + getitem_1 = causal_mask[ + (None, None, slice(None, 
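# annotation: the [None, None, :, :] indexing adds batch and head axes to the
# boolean mask built just above, where `kv_arange_1 <= reshaped_cache_position`
# produced a lower-triangular causal mask; it is then multiplied (a logical
# AND) with the padding mask to give the final [1, 1, seq, seq] attention mask.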
None, None), slice(None, None, None)) + ] + causal_mask = None + causal_mask_1 = getitem_1.expand(1, -1, -1, -1) + getitem_1 = None + getitem_2 = local_padding_mask[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + local_padding_mask = None + causal_mask_2 = causal_mask_1 * getitem_2 + causal_mask_1 = getitem_2 = None + hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False) + hidden_states = None + hidden_states_2 = torch.nn.functional.layer_norm( + hidden_states_1, + (768,), + l_self_modules_h_modules_0_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_0_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_0_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_0_modules_ln_1_parameters_bias_ + ) = None + view_2 = hidden_states_2.view(-1, 768) + hidden_states_2 = None + x = torch.addmm( + l_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_bias_, + view_2, + l_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_bias_ = ( + view_2 + ) = ( + l_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_1 = x.view((1, 2, 2304)) + x = None + split = x_1.split(768, dim=2) + x_1 = None + query_states = split[0] + key_states = split[1] + value_states = split[2] + split = None + view_4 = key_states.view((1, 2, -1, 64)) + key_states = None + key_states_1 = view_4.transpose(1, 2) + view_4 = None + view_5 = value_states.view((1, 2, -1, 64)) + value_states = None + value_states_1 = view_5.transpose(1, 2) + view_5 = None + view_6 = query_states.view((1, 2, -1, 64)) + query_states = None + query_states_1 = view_6.transpose(1, 2) + view_6 = None + attention_mask_2 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query = query_states_1.contiguous() + query_states_1 = None + key = key_states_1.contiguous() + value = value_states_1.contiguous() + attn_output = torch._C._nn.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query = key = value = attention_mask_2 = None + transpose_3 = attn_output.transpose(1, 2) + attn_output = None + attn_output_1 = transpose_3.contiguous() + transpose_3 = None + reshape = attn_output_1.reshape(1, 2, -1) + attn_output_1 = None + attn_output_2 = reshape.contiguous() + reshape = None + view_7 = attn_output_2.view(-1, 768) + attn_output_2 = None + x_2 = torch.addmm( + l_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_bias_, + view_7, + l_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_bias_ = ( + view_7 + ) = ( + l_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_3 = x_2.view((1, 2, 768)) + x_2 = None + attn_output_3 = torch.nn.functional.dropout(x_3, 0.1, False, False) + x_3 = None + hidden_states_3 = attn_output_3 + hidden_states_1 + attn_output_3 = hidden_states_1 = None + hidden_states_4 = torch.nn.functional.layer_norm( + hidden_states_3, + (768,), + l_self_modules_h_modules_0_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_0_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_0_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_0_modules_ln_2_parameters_bias_ + ) = None + view_9 = 
hidden_states_4.view(-1, 768) + hidden_states_4 = None + x_4 = torch.addmm( + l_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_bias_, + view_9, + l_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_9 + ) = ( + l_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_5 = x_4.view((1, 2, 3072)) + x_4 = None + mul_1 = 0.5 * x_5 + pow_1 = torch.pow(x_5, 3.0) + mul_2 = 0.044715 * pow_1 + pow_1 = None + add_2 = x_5 + mul_2 + x_5 = mul_2 = None + mul_3 = 0.7978845608028654 * add_2 + add_2 = None + tanh = torch.tanh(mul_3) + mul_3 = None + add_3 = 1.0 + tanh + tanh = None + hidden_states_5 = mul_1 * add_3 + mul_1 = add_3 = None + view_11 = hidden_states_5.view(-1, 3072) + hidden_states_5 = None + x_6 = torch.addmm( + l_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_bias_, + view_11, + l_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_11 + ) = ( + l_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_7 = x_6.view((1, 2, 768)) + x_6 = None + hidden_states_6 = torch.nn.functional.dropout(x_7, 0.1, False, False) + x_7 = None + hidden_states_7 = hidden_states_3 + hidden_states_6 + hidden_states_3 = hidden_states_6 = None + hidden_states_8 = torch.nn.functional.layer_norm( + hidden_states_7, + (768,), + l_self_modules_h_modules_1_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_1_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_1_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_1_modules_ln_1_parameters_bias_ + ) = None + view_13 = hidden_states_8.view(-1, 768) + hidden_states_8 = None + x_8 = torch.addmm( + l_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_bias_, + view_13, + l_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_bias_ = ( + view_13 + ) = ( + l_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_9 = x_8.view((1, 2, 2304)) + x_8 = None + split_1 = x_9.split(768, dim=2) + x_9 = None + query_states_2 = split_1[0] + key_states_2 = split_1[1] + value_states_2 = split_1[2] + split_1 = None + view_15 = key_states_2.view((1, 2, -1, 64)) + key_states_2 = None + key_states_3 = view_15.transpose(1, 2) + view_15 = None + view_16 = value_states_2.view((1, 2, -1, 64)) + value_states_2 = None + value_states_3 = view_16.transpose(1, 2) + view_16 = None + view_17 = query_states_2.view((1, 2, -1, 64)) + query_states_2 = None + query_states_3 = view_17.transpose(1, 2) + view_17 = None + attention_mask_3 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_1 = query_states_3.contiguous() + query_states_3 = None + key_1 = key_states_3.contiguous() + value_1 = value_states_3.contiguous() + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_1, + value_1, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_1 = key_1 = value_1 = attention_mask_3 = None + transpose_7 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_7.contiguous() + transpose_7 = None + reshape_1 = attn_output_5.reshape(1, 2, -1) + attn_output_5 = None + attn_output_6 = 
reshape_1.contiguous() + reshape_1 = None + view_18 = attn_output_6.view(-1, 768) + attn_output_6 = None + x_10 = torch.addmm( + l_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_bias_, + view_18, + l_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_bias_ = ( + view_18 + ) = ( + l_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_11 = x_10.view((1, 2, 768)) + x_10 = None + attn_output_7 = torch.nn.functional.dropout(x_11, 0.1, False, False) + x_11 = None + hidden_states_9 = attn_output_7 + hidden_states_7 + attn_output_7 = hidden_states_7 = None + hidden_states_10 = torch.nn.functional.layer_norm( + hidden_states_9, + (768,), + l_self_modules_h_modules_1_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_1_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_1_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_1_modules_ln_2_parameters_bias_ + ) = None + view_20 = hidden_states_10.view(-1, 768) + hidden_states_10 = None + x_12 = torch.addmm( + l_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_bias_, + view_20, + l_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_20 + ) = ( + l_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_13 = x_12.view((1, 2, 3072)) + x_12 = None + mul_5 = 0.5 * x_13 + pow_2 = torch.pow(x_13, 3.0) + mul_6 = 0.044715 * pow_2 + pow_2 = None + add_6 = x_13 + mul_6 + x_13 = mul_6 = None + mul_7 = 0.7978845608028654 * add_6 + add_6 = None + tanh_1 = torch.tanh(mul_7) + mul_7 = None + add_7 = 1.0 + tanh_1 + tanh_1 = None + hidden_states_11 = mul_5 * add_7 + mul_5 = add_7 = None + view_22 = hidden_states_11.view(-1, 3072) + hidden_states_11 = None + x_14 = torch.addmm( + l_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_bias_, + view_22, + l_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_22 + ) = ( + l_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_15 = x_14.view((1, 2, 768)) + x_14 = None + hidden_states_12 = torch.nn.functional.dropout(x_15, 0.1, False, False) + x_15 = None + hidden_states_13 = hidden_states_9 + hidden_states_12 + hidden_states_9 = hidden_states_12 = None + hidden_states_14 = torch.nn.functional.layer_norm( + hidden_states_13, + (768,), + l_self_modules_h_modules_2_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_2_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_2_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_2_modules_ln_1_parameters_bias_ + ) = None + view_24 = hidden_states_14.view(-1, 768) + hidden_states_14 = None + x_16 = torch.addmm( + l_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_bias_, + view_24, + l_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_bias_ = ( + view_24 + ) = ( + l_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_17 = x_16.view((1, 2, 2304)) + x_16 = None + split_2 = x_17.split(768, dim=2) + x_17 = None + query_states_4 = split_2[0] + key_states_4 = split_2[1] + value_states_4 = split_2[2] + split_2 = None + view_26 = 
key_states_4.view((1, 2, -1, 64)) + key_states_4 = None + key_states_5 = view_26.transpose(1, 2) + view_26 = None + view_27 = value_states_4.view((1, 2, -1, 64)) + value_states_4 = None + value_states_5 = view_27.transpose(1, 2) + view_27 = None + view_28 = query_states_4.view((1, 2, -1, 64)) + query_states_4 = None + query_states_5 = view_28.transpose(1, 2) + view_28 = None + attention_mask_4 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_2 = query_states_5.contiguous() + query_states_5 = None + key_2 = key_states_5.contiguous() + value_2 = value_states_5.contiguous() + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_2, + value_2, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_2 = key_2 = value_2 = attention_mask_4 = None + transpose_11 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_11.contiguous() + transpose_11 = None + reshape_2 = attn_output_9.reshape(1, 2, -1) + attn_output_9 = None + attn_output_10 = reshape_2.contiguous() + reshape_2 = None + view_29 = attn_output_10.view(-1, 768) + attn_output_10 = None + x_18 = torch.addmm( + l_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_bias_, + view_29, + l_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_bias_ = ( + view_29 + ) = ( + l_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_19 = x_18.view((1, 2, 768)) + x_18 = None + attn_output_11 = torch.nn.functional.dropout(x_19, 0.1, False, False) + x_19 = None + hidden_states_15 = attn_output_11 + hidden_states_13 + attn_output_11 = hidden_states_13 = None + hidden_states_16 = torch.nn.functional.layer_norm( + hidden_states_15, + (768,), + l_self_modules_h_modules_2_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_2_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_2_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_2_modules_ln_2_parameters_bias_ + ) = None + view_31 = hidden_states_16.view(-1, 768) + hidden_states_16 = None + x_20 = torch.addmm( + l_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_bias_, + view_31, + l_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_31 + ) = ( + l_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_21 = x_20.view((1, 2, 3072)) + x_20 = None + mul_9 = 0.5 * x_21 + pow_3 = torch.pow(x_21, 3.0) + mul_10 = 0.044715 * pow_3 + pow_3 = None + add_10 = x_21 + mul_10 + x_21 = mul_10 = None + mul_11 = 0.7978845608028654 * add_10 + add_10 = None + tanh_2 = torch.tanh(mul_11) + mul_11 = None + add_11 = 1.0 + tanh_2 + tanh_2 = None + hidden_states_17 = mul_9 * add_11 + mul_9 = add_11 = None + view_33 = hidden_states_17.view(-1, 3072) + hidden_states_17 = None + x_22 = torch.addmm( + l_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_bias_, + view_33, + l_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_33 + ) = ( + l_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_23 = x_22.view((1, 2, 768)) + x_22 = None + hidden_states_18 = torch.nn.functional.dropout(x_23, 0.1, 
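# annotation: traced with p=0.1, training=False, inplace=False, so every
# dropout in this graph is an identity op at inference time.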
False, False) + x_23 = None + hidden_states_19 = hidden_states_15 + hidden_states_18 + hidden_states_15 = hidden_states_18 = None + hidden_states_20 = torch.nn.functional.layer_norm( + hidden_states_19, + (768,), + l_self_modules_h_modules_3_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_3_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_3_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_3_modules_ln_1_parameters_bias_ + ) = None + view_35 = hidden_states_20.view(-1, 768) + hidden_states_20 = None + x_24 = torch.addmm( + l_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_bias_, + view_35, + l_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_bias_ = ( + view_35 + ) = ( + l_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_25 = x_24.view((1, 2, 2304)) + x_24 = None + split_3 = x_25.split(768, dim=2) + x_25 = None + query_states_6 = split_3[0] + key_states_6 = split_3[1] + value_states_6 = split_3[2] + split_3 = None + view_37 = key_states_6.view((1, 2, -1, 64)) + key_states_6 = None + key_states_7 = view_37.transpose(1, 2) + view_37 = None + view_38 = value_states_6.view((1, 2, -1, 64)) + value_states_6 = None + value_states_7 = view_38.transpose(1, 2) + view_38 = None + view_39 = query_states_6.view((1, 2, -1, 64)) + query_states_6 = None + query_states_7 = view_39.transpose(1, 2) + view_39 = None + attention_mask_5 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_3 = query_states_7.contiguous() + query_states_7 = None + key_3 = key_states_7.contiguous() + value_3 = value_states_7.contiguous() + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_3, + key_3, + value_3, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_3 = key_3 = value_3 = attention_mask_5 = None + transpose_15 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_15.contiguous() + transpose_15 = None + reshape_3 = attn_output_13.reshape(1, 2, -1) + attn_output_13 = None + attn_output_14 = reshape_3.contiguous() + reshape_3 = None + view_40 = attn_output_14.view(-1, 768) + attn_output_14 = None + x_26 = torch.addmm( + l_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_bias_, + view_40, + l_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_bias_ = ( + view_40 + ) = ( + l_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_27 = x_26.view((1, 2, 768)) + x_26 = None + attn_output_15 = torch.nn.functional.dropout(x_27, 0.1, False, False) + x_27 = None + hidden_states_21 = attn_output_15 + hidden_states_19 + attn_output_15 = hidden_states_19 = None + hidden_states_22 = torch.nn.functional.layer_norm( + hidden_states_21, + (768,), + l_self_modules_h_modules_3_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_3_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_3_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_3_modules_ln_2_parameters_bias_ + ) = None + view_42 = hidden_states_22.view(-1, 768) + hidden_states_22 = None + x_28 = torch.addmm( + l_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_bias_, + view_42, + 
l_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_42 + ) = ( + l_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_29 = x_28.view((1, 2, 3072)) + x_28 = None + mul_13 = 0.5 * x_29 + pow_4 = torch.pow(x_29, 3.0) + mul_14 = 0.044715 * pow_4 + pow_4 = None + add_14 = x_29 + mul_14 + x_29 = mul_14 = None + mul_15 = 0.7978845608028654 * add_14 + add_14 = None + tanh_3 = torch.tanh(mul_15) + mul_15 = None + add_15 = 1.0 + tanh_3 + tanh_3 = None + hidden_states_23 = mul_13 * add_15 + mul_13 = add_15 = None + view_44 = hidden_states_23.view(-1, 3072) + hidden_states_23 = None + x_30 = torch.addmm( + l_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_bias_, + view_44, + l_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_44 + ) = ( + l_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_31 = x_30.view((1, 2, 768)) + x_30 = None + hidden_states_24 = torch.nn.functional.dropout(x_31, 0.1, False, False) + x_31 = None + hidden_states_25 = hidden_states_21 + hidden_states_24 + hidden_states_21 = hidden_states_24 = None + hidden_states_26 = torch.nn.functional.layer_norm( + hidden_states_25, + (768,), + l_self_modules_h_modules_4_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_4_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_4_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_4_modules_ln_1_parameters_bias_ + ) = None + view_46 = hidden_states_26.view(-1, 768) + hidden_states_26 = None + x_32 = torch.addmm( + l_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_bias_, + view_46, + l_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_bias_ = ( + view_46 + ) = ( + l_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_33 = x_32.view((1, 2, 2304)) + x_32 = None + split_4 = x_33.split(768, dim=2) + x_33 = None + query_states_8 = split_4[0] + key_states_8 = split_4[1] + value_states_8 = split_4[2] + split_4 = None + view_48 = key_states_8.view((1, 2, -1, 64)) + key_states_8 = None + key_states_9 = view_48.transpose(1, 2) + view_48 = None + view_49 = value_states_8.view((1, 2, -1, 64)) + value_states_8 = None + value_states_9 = view_49.transpose(1, 2) + view_49 = None + view_50 = query_states_8.view((1, 2, -1, 64)) + query_states_8 = None + query_states_9 = view_50.transpose(1, 2) + view_50 = None + attention_mask_6 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_4 = query_states_9.contiguous() + query_states_9 = None + key_4 = key_states_9.contiguous() + value_4 = value_states_9.contiguous() + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_4, + value_4, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_4 = key_4 = value_4 = attention_mask_6 = None + transpose_19 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_19.contiguous() + transpose_19 = None + reshape_4 = attn_output_17.reshape(1, 2, -1) + attn_output_17 = None + attn_output_18 = reshape_4.contiguous() + reshape_4 = None + view_51 = attn_output_18.view(-1, 768) + 
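+        # [Reviewer note, added for readability; not part of the captured trace]
+        # Each torch.addmm(bias, x, weight) in this graph computes bias + x @ weight.
+        # The weight layout ([in_features, out_features], e.g. [768, 2304] for c_attn)
+        # matches Hugging Face GPT-2's Conv1D module, which stores its weight
+        # transposed relative to torch.nn.Linear.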
attn_output_18 = None + x_34 = torch.addmm( + l_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_bias_, + view_51, + l_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_bias_ = ( + view_51 + ) = ( + l_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_35 = x_34.view((1, 2, 768)) + x_34 = None + attn_output_19 = torch.nn.functional.dropout(x_35, 0.1, False, False) + x_35 = None + hidden_states_27 = attn_output_19 + hidden_states_25 + attn_output_19 = hidden_states_25 = None + hidden_states_28 = torch.nn.functional.layer_norm( + hidden_states_27, + (768,), + l_self_modules_h_modules_4_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_4_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_4_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_4_modules_ln_2_parameters_bias_ + ) = None + view_53 = hidden_states_28.view(-1, 768) + hidden_states_28 = None + x_36 = torch.addmm( + l_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_bias_, + view_53, + l_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_53 + ) = ( + l_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_37 = x_36.view((1, 2, 3072)) + x_36 = None + mul_17 = 0.5 * x_37 + pow_5 = torch.pow(x_37, 3.0) + mul_18 = 0.044715 * pow_5 + pow_5 = None + add_18 = x_37 + mul_18 + x_37 = mul_18 = None + mul_19 = 0.7978845608028654 * add_18 + add_18 = None + tanh_4 = torch.tanh(mul_19) + mul_19 = None + add_19 = 1.0 + tanh_4 + tanh_4 = None + hidden_states_29 = mul_17 * add_19 + mul_17 = add_19 = None + view_55 = hidden_states_29.view(-1, 3072) + hidden_states_29 = None + x_38 = torch.addmm( + l_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_bias_, + view_55, + l_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_55 + ) = ( + l_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_39 = x_38.view((1, 2, 768)) + x_38 = None + hidden_states_30 = torch.nn.functional.dropout(x_39, 0.1, False, False) + x_39 = None + hidden_states_31 = hidden_states_27 + hidden_states_30 + hidden_states_27 = hidden_states_30 = None + hidden_states_32 = torch.nn.functional.layer_norm( + hidden_states_31, + (768,), + l_self_modules_h_modules_5_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_5_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_5_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_5_modules_ln_1_parameters_bias_ + ) = None + view_57 = hidden_states_32.view(-1, 768) + hidden_states_32 = None + x_40 = torch.addmm( + l_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_bias_, + view_57, + l_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_bias_ = ( + view_57 + ) = ( + l_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_41 = x_40.view((1, 2, 2304)) + x_40 = None + split_5 = x_41.split(768, dim=2) + x_41 = None + query_states_10 = split_5[0] + key_states_10 = split_5[1] + value_states_10 = split_5[2] + split_5 = None + view_59 = key_states_10.view((1, 2, -1, 64)) + key_states_10 = None + 
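+        # [Reviewer note] c_attn packs query/key/value into a single projection
+        # (2304 = 3 * 768); split(768, dim=2) recovers the three streams, and
+        # view((1, 2, -1, 64)) followed by transpose(1, 2) lays each out as
+        # (batch=1, num_heads=12, seq_len=2, head_dim=64). The head count 12 is
+        # implied by 768 / 64 in the captured shapes.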
key_states_11 = view_59.transpose(1, 2) + view_59 = None + view_60 = value_states_10.view((1, 2, -1, 64)) + value_states_10 = None + value_states_11 = view_60.transpose(1, 2) + view_60 = None + view_61 = query_states_10.view((1, 2, -1, 64)) + query_states_10 = None + query_states_11 = view_61.transpose(1, 2) + view_61 = None + attention_mask_7 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_5 = query_states_11.contiguous() + query_states_11 = None + key_5 = key_states_11.contiguous() + value_5 = value_states_11.contiguous() + attn_output_20 = torch._C._nn.scaled_dot_product_attention( + query_5, + key_5, + value_5, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_5 = key_5 = value_5 = attention_mask_7 = None + transpose_23 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_23.contiguous() + transpose_23 = None + reshape_5 = attn_output_21.reshape(1, 2, -1) + attn_output_21 = None + attn_output_22 = reshape_5.contiguous() + reshape_5 = None + view_62 = attn_output_22.view(-1, 768) + attn_output_22 = None + x_42 = torch.addmm( + l_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_bias_, + view_62, + l_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_bias_ = ( + view_62 + ) = ( + l_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_43 = x_42.view((1, 2, 768)) + x_42 = None + attn_output_23 = torch.nn.functional.dropout(x_43, 0.1, False, False) + x_43 = None + hidden_states_33 = attn_output_23 + hidden_states_31 + attn_output_23 = hidden_states_31 = None + hidden_states_34 = torch.nn.functional.layer_norm( + hidden_states_33, + (768,), + l_self_modules_h_modules_5_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_5_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_5_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_5_modules_ln_2_parameters_bias_ + ) = None + view_64 = hidden_states_34.view(-1, 768) + hidden_states_34 = None + x_44 = torch.addmm( + l_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_bias_, + view_64, + l_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_64 + ) = ( + l_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_45 = x_44.view((1, 2, 3072)) + x_44 = None + mul_21 = 0.5 * x_45 + pow_6 = torch.pow(x_45, 3.0) + mul_22 = 0.044715 * pow_6 + pow_6 = None + add_22 = x_45 + mul_22 + x_45 = mul_22 = None + mul_23 = 0.7978845608028654 * add_22 + add_22 = None + tanh_5 = torch.tanh(mul_23) + mul_23 = None + add_23 = 1.0 + tanh_5 + tanh_5 = None + hidden_states_35 = mul_21 * add_23 + mul_21 = add_23 = None + view_66 = hidden_states_35.view(-1, 3072) + hidden_states_35 = None + x_46 = torch.addmm( + l_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_bias_, + view_66, + l_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_66 + ) = ( + l_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_47 = x_46.view((1, 2, 768)) + x_46 = None + hidden_states_36 = torch.nn.functional.dropout(x_47, 0.1, False, False) + x_47 = None + 
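+        # [Reviewer note] The mul/pow/tanh sequence above is GPT-2's tanh-approximate
+        # GELU, 0.5 * x * (1.0 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))), with
+        # sqrt(2/pi) inlined as the constant 0.7978845608028654.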
hidden_states_37 = hidden_states_33 + hidden_states_36 + hidden_states_33 = hidden_states_36 = None + hidden_states_38 = torch.nn.functional.layer_norm( + hidden_states_37, + (768,), + l_self_modules_h_modules_6_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_6_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_6_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_6_modules_ln_1_parameters_bias_ + ) = None + view_68 = hidden_states_38.view(-1, 768) + hidden_states_38 = None + x_48 = torch.addmm( + l_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_bias_, + view_68, + l_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_bias_ = ( + view_68 + ) = ( + l_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_49 = x_48.view((1, 2, 2304)) + x_48 = None + split_6 = x_49.split(768, dim=2) + x_49 = None + query_states_12 = split_6[0] + key_states_12 = split_6[1] + value_states_12 = split_6[2] + split_6 = None + view_70 = key_states_12.view((1, 2, -1, 64)) + key_states_12 = None + key_states_13 = view_70.transpose(1, 2) + view_70 = None + view_71 = value_states_12.view((1, 2, -1, 64)) + value_states_12 = None + value_states_13 = view_71.transpose(1, 2) + view_71 = None + view_72 = query_states_12.view((1, 2, -1, 64)) + query_states_12 = None + query_states_13 = view_72.transpose(1, 2) + view_72 = None + attention_mask_8 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_6 = query_states_13.contiguous() + query_states_13 = None + key_6 = key_states_13.contiguous() + value_6 = value_states_13.contiguous() + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_6, + value_6, + attn_mask=attention_mask_8, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_6 = key_6 = value_6 = attention_mask_8 = None + transpose_27 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_27.contiguous() + transpose_27 = None + reshape_6 = attn_output_25.reshape(1, 2, -1) + attn_output_25 = None + attn_output_26 = reshape_6.contiguous() + reshape_6 = None + view_73 = attn_output_26.view(-1, 768) + attn_output_26 = None + x_50 = torch.addmm( + l_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_bias_, + view_73, + l_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_bias_ = ( + view_73 + ) = ( + l_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_51 = x_50.view((1, 2, 768)) + x_50 = None + attn_output_27 = torch.nn.functional.dropout(x_51, 0.1, False, False) + x_51 = None + hidden_states_39 = attn_output_27 + hidden_states_37 + attn_output_27 = hidden_states_37 = None + hidden_states_40 = torch.nn.functional.layer_norm( + hidden_states_39, + (768,), + l_self_modules_h_modules_6_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_6_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_6_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_6_modules_ln_2_parameters_bias_ + ) = None + view_75 = hidden_states_40.view(-1, 768) + hidden_states_40 = None + x_52 = torch.addmm( + l_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_bias_, + view_75, + 
l_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_75 + ) = ( + l_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_53 = x_52.view((1, 2, 3072)) + x_52 = None + mul_25 = 0.5 * x_53 + pow_7 = torch.pow(x_53, 3.0) + mul_26 = 0.044715 * pow_7 + pow_7 = None + add_26 = x_53 + mul_26 + x_53 = mul_26 = None + mul_27 = 0.7978845608028654 * add_26 + add_26 = None + tanh_6 = torch.tanh(mul_27) + mul_27 = None + add_27 = 1.0 + tanh_6 + tanh_6 = None + hidden_states_41 = mul_25 * add_27 + mul_25 = add_27 = None + view_77 = hidden_states_41.view(-1, 3072) + hidden_states_41 = None + x_54 = torch.addmm( + l_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_bias_, + view_77, + l_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_77 + ) = ( + l_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_55 = x_54.view((1, 2, 768)) + x_54 = None + hidden_states_42 = torch.nn.functional.dropout(x_55, 0.1, False, False) + x_55 = None + hidden_states_43 = hidden_states_39 + hidden_states_42 + hidden_states_39 = hidden_states_42 = None + hidden_states_44 = torch.nn.functional.layer_norm( + hidden_states_43, + (768,), + l_self_modules_h_modules_7_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_7_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_7_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_7_modules_ln_1_parameters_bias_ + ) = None + view_79 = hidden_states_44.view(-1, 768) + hidden_states_44 = None + x_56 = torch.addmm( + l_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_bias_, + view_79, + l_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_bias_ = ( + view_79 + ) = ( + l_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_57 = x_56.view((1, 2, 2304)) + x_56 = None + split_7 = x_57.split(768, dim=2) + x_57 = None + query_states_14 = split_7[0] + key_states_14 = split_7[1] + value_states_14 = split_7[2] + split_7 = None + view_81 = key_states_14.view((1, 2, -1, 64)) + key_states_14 = None + key_states_15 = view_81.transpose(1, 2) + view_81 = None + view_82 = value_states_14.view((1, 2, -1, 64)) + value_states_14 = None + value_states_15 = view_82.transpose(1, 2) + view_82 = None + view_83 = query_states_14.view((1, 2, -1, 64)) + query_states_14 = None + query_states_15 = view_83.transpose(1, 2) + view_83 = None + attention_mask_9 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_7 = query_states_15.contiguous() + query_states_15 = None + key_7 = key_states_15.contiguous() + value_7 = value_states_15.contiguous() + attn_output_28 = torch._C._nn.scaled_dot_product_attention( + query_7, + key_7, + value_7, + attn_mask=attention_mask_9, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_7 = key_7 = value_7 = attention_mask_9 = None + transpose_31 = attn_output_28.transpose(1, 2) + attn_output_28 = None + attn_output_29 = transpose_31.contiguous() + transpose_31 = None + reshape_7 = attn_output_29.reshape(1, 2, -1) + attn_output_29 = None + attn_output_30 = reshape_7.contiguous() + reshape_7 = None + view_84 = attn_output_30.view(-1, 
768) + attn_output_30 = None + x_58 = torch.addmm( + l_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_bias_, + view_84, + l_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_bias_ = ( + view_84 + ) = ( + l_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_59 = x_58.view((1, 2, 768)) + x_58 = None + attn_output_31 = torch.nn.functional.dropout(x_59, 0.1, False, False) + x_59 = None + hidden_states_45 = attn_output_31 + hidden_states_43 + attn_output_31 = hidden_states_43 = None + hidden_states_46 = torch.nn.functional.layer_norm( + hidden_states_45, + (768,), + l_self_modules_h_modules_7_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_7_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_7_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_7_modules_ln_2_parameters_bias_ + ) = None + view_86 = hidden_states_46.view(-1, 768) + hidden_states_46 = None + x_60 = torch.addmm( + l_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_bias_, + view_86, + l_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_86 + ) = ( + l_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_61 = x_60.view((1, 2, 3072)) + x_60 = None + mul_29 = 0.5 * x_61 + pow_8 = torch.pow(x_61, 3.0) + mul_30 = 0.044715 * pow_8 + pow_8 = None + add_30 = x_61 + mul_30 + x_61 = mul_30 = None + mul_31 = 0.7978845608028654 * add_30 + add_30 = None + tanh_7 = torch.tanh(mul_31) + mul_31 = None + add_31 = 1.0 + tanh_7 + tanh_7 = None + hidden_states_47 = mul_29 * add_31 + mul_29 = add_31 = None + view_88 = hidden_states_47.view(-1, 3072) + hidden_states_47 = None + x_62 = torch.addmm( + l_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_bias_, + view_88, + l_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_88 + ) = ( + l_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_63 = x_62.view((1, 2, 768)) + x_62 = None + hidden_states_48 = torch.nn.functional.dropout(x_63, 0.1, False, False) + x_63 = None + hidden_states_49 = hidden_states_45 + hidden_states_48 + hidden_states_45 = hidden_states_48 = None + hidden_states_50 = torch.nn.functional.layer_norm( + hidden_states_49, + (768,), + l_self_modules_h_modules_8_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_8_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_8_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_8_modules_ln_1_parameters_bias_ + ) = None + view_90 = hidden_states_50.view(-1, 768) + hidden_states_50 = None + x_64 = torch.addmm( + l_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_bias_, + view_90, + l_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_bias_ = ( + view_90 + ) = ( + l_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_65 = x_64.view((1, 2, 2304)) + x_64 = None + split_8 = x_65.split(768, dim=2) + x_65 = None + query_states_16 = split_8[0] + key_states_16 = split_8[1] + value_states_16 = split_8[2] + split_8 = None + view_92 = key_states_16.view((1, 2, -1, 64)) + key_states_16 = None + 
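+        # [Reviewer note] The slice causal_mask_2[:, :, :, :2] (below) trims the
+        # precomputed causal mask to the current key length of 2 tokens; because
+        # the mask is passed explicitly via attn_mask, scaled_dot_product_attention
+        # is called with is_causal=False throughout this trace.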
key_states_17 = view_92.transpose(1, 2) + view_92 = None + view_93 = value_states_16.view((1, 2, -1, 64)) + value_states_16 = None + value_states_17 = view_93.transpose(1, 2) + view_93 = None + view_94 = query_states_16.view((1, 2, -1, 64)) + query_states_16 = None + query_states_17 = view_94.transpose(1, 2) + view_94 = None + attention_mask_10 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_8 = query_states_17.contiguous() + query_states_17 = None + key_8 = key_states_17.contiguous() + value_8 = value_states_17.contiguous() + attn_output_32 = torch._C._nn.scaled_dot_product_attention( + query_8, + key_8, + value_8, + attn_mask=attention_mask_10, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_8 = key_8 = value_8 = attention_mask_10 = None + transpose_35 = attn_output_32.transpose(1, 2) + attn_output_32 = None + attn_output_33 = transpose_35.contiguous() + transpose_35 = None + reshape_8 = attn_output_33.reshape(1, 2, -1) + attn_output_33 = None + attn_output_34 = reshape_8.contiguous() + reshape_8 = None + view_95 = attn_output_34.view(-1, 768) + attn_output_34 = None + x_66 = torch.addmm( + l_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_bias_, + view_95, + l_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_bias_ = ( + view_95 + ) = ( + l_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_67 = x_66.view((1, 2, 768)) + x_66 = None + attn_output_35 = torch.nn.functional.dropout(x_67, 0.1, False, False) + x_67 = None + hidden_states_51 = attn_output_35 + hidden_states_49 + attn_output_35 = hidden_states_49 = None + hidden_states_52 = torch.nn.functional.layer_norm( + hidden_states_51, + (768,), + l_self_modules_h_modules_8_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_8_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_8_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_8_modules_ln_2_parameters_bias_ + ) = None + view_97 = hidden_states_52.view(-1, 768) + hidden_states_52 = None + x_68 = torch.addmm( + l_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_bias_, + view_97, + l_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_97 + ) = ( + l_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_69 = x_68.view((1, 2, 3072)) + x_68 = None + mul_33 = 0.5 * x_69 + pow_9 = torch.pow(x_69, 3.0) + mul_34 = 0.044715 * pow_9 + pow_9 = None + add_34 = x_69 + mul_34 + x_69 = mul_34 = None + mul_35 = 0.7978845608028654 * add_34 + add_34 = None + tanh_8 = torch.tanh(mul_35) + mul_35 = None + add_35 = 1.0 + tanh_8 + tanh_8 = None + hidden_states_53 = mul_33 * add_35 + mul_33 = add_35 = None + view_99 = hidden_states_53.view(-1, 3072) + hidden_states_53 = None + x_70 = torch.addmm( + l_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_bias_, + view_99, + l_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_99 + ) = ( + l_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_71 = x_70.view((1, 2, 768)) + x_70 = None + hidden_states_54 = torch.nn.functional.dropout(x_71, 0.1, False, False) + x_71 = None + 
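+        # [Reviewer note] Every layer repeats the same pre-LayerNorm scheme, roughly
+        # x = x + Dropout(c_proj(attn(ln_1(x)))) then x = x + Dropout(mlp(ln_2(x)));
+        # hidden_states_54 above is the h.8 MLP branch, added back onto the residual
+        # stream immediately below.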
hidden_states_55 = hidden_states_51 + hidden_states_54 + hidden_states_51 = hidden_states_54 = None + hidden_states_56 = torch.nn.functional.layer_norm( + hidden_states_55, + (768,), + l_self_modules_h_modules_9_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_9_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_9_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_9_modules_ln_1_parameters_bias_ + ) = None + view_101 = hidden_states_56.view(-1, 768) + hidden_states_56 = None + x_72 = torch.addmm( + l_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_bias_, + view_101, + l_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_bias_ = ( + view_101 + ) = ( + l_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_73 = x_72.view((1, 2, 2304)) + x_72 = None + split_9 = x_73.split(768, dim=2) + x_73 = None + query_states_18 = split_9[0] + key_states_18 = split_9[1] + value_states_18 = split_9[2] + split_9 = None + view_103 = key_states_18.view((1, 2, -1, 64)) + key_states_18 = None + key_states_19 = view_103.transpose(1, 2) + view_103 = None + view_104 = value_states_18.view((1, 2, -1, 64)) + value_states_18 = None + value_states_19 = view_104.transpose(1, 2) + view_104 = None + view_105 = query_states_18.view((1, 2, -1, 64)) + query_states_18 = None + query_states_19 = view_105.transpose(1, 2) + view_105 = None + attention_mask_11 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_9 = query_states_19.contiguous() + query_states_19 = None + key_9 = key_states_19.contiguous() + value_9 = value_states_19.contiguous() + attn_output_36 = torch._C._nn.scaled_dot_product_attention( + query_9, + key_9, + value_9, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_9 = key_9 = value_9 = attention_mask_11 = None + transpose_39 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_37 = transpose_39.contiguous() + transpose_39 = None + reshape_9 = attn_output_37.reshape(1, 2, -1) + attn_output_37 = None + attn_output_38 = reshape_9.contiguous() + reshape_9 = None + view_106 = attn_output_38.view(-1, 768) + attn_output_38 = None + x_74 = torch.addmm( + l_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_bias_, + view_106, + l_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_bias_ = ( + view_106 + ) = ( + l_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_75 = x_74.view((1, 2, 768)) + x_74 = None + attn_output_39 = torch.nn.functional.dropout(x_75, 0.1, False, False) + x_75 = None + hidden_states_57 = attn_output_39 + hidden_states_55 + attn_output_39 = hidden_states_55 = None + hidden_states_58 = torch.nn.functional.layer_norm( + hidden_states_57, + (768,), + l_self_modules_h_modules_9_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_9_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_9_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_9_modules_ln_2_parameters_bias_ + ) = None + view_108 = hidden_states_58.view(-1, 768) + hidden_states_58 = None + x_76 = torch.addmm( + l_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_bias_, + view_108, + 
l_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_108 + ) = ( + l_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_77 = x_76.view((1, 2, 3072)) + x_76 = None + mul_37 = 0.5 * x_77 + pow_10 = torch.pow(x_77, 3.0) + mul_38 = 0.044715 * pow_10 + pow_10 = None + add_38 = x_77 + mul_38 + x_77 = mul_38 = None + mul_39 = 0.7978845608028654 * add_38 + add_38 = None + tanh_9 = torch.tanh(mul_39) + mul_39 = None + add_39 = 1.0 + tanh_9 + tanh_9 = None + hidden_states_59 = mul_37 * add_39 + mul_37 = add_39 = None + view_110 = hidden_states_59.view(-1, 3072) + hidden_states_59 = None + x_78 = torch.addmm( + l_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_bias_, + view_110, + l_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_110 + ) = ( + l_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_79 = x_78.view((1, 2, 768)) + x_78 = None + hidden_states_60 = torch.nn.functional.dropout(x_79, 0.1, False, False) + x_79 = None + hidden_states_61 = hidden_states_57 + hidden_states_60 + hidden_states_57 = hidden_states_60 = None + hidden_states_62 = torch.nn.functional.layer_norm( + hidden_states_61, + (768,), + l_self_modules_h_modules_10_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_10_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_10_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_10_modules_ln_1_parameters_bias_ + ) = None + view_112 = hidden_states_62.view(-1, 768) + hidden_states_62 = None + x_80 = torch.addmm( + l_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_bias_, + view_112, + l_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_bias_ = ( + view_112 + ) = ( + l_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_81 = x_80.view((1, 2, 2304)) + x_80 = None + split_10 = x_81.split(768, dim=2) + x_81 = None + query_states_20 = split_10[0] + key_states_20 = split_10[1] + value_states_20 = split_10[2] + split_10 = None + view_114 = key_states_20.view((1, 2, -1, 64)) + key_states_20 = None + key_states_21 = view_114.transpose(1, 2) + view_114 = None + view_115 = value_states_20.view((1, 2, -1, 64)) + value_states_20 = None + value_states_21 = view_115.transpose(1, 2) + view_115 = None + view_116 = query_states_20.view((1, 2, -1, 64)) + query_states_20 = None + query_states_21 = view_116.transpose(1, 2) + view_116 = None + attention_mask_12 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_10 = query_states_21.contiguous() + query_states_21 = None + key_10 = key_states_21.contiguous() + value_10 = value_states_21.contiguous() + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + query_10, + key_10, + value_10, + attn_mask=attention_mask_12, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_10 = key_10 = value_10 = attention_mask_12 = None + transpose_43 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_43.contiguous() + transpose_43 = None + reshape_10 = attn_output_41.reshape(1, 2, -1) + attn_output_41 = None + attn_output_42 = reshape_10.contiguous() + 
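+        # [Reviewer note] The pervasive `name = None` rebindings appear to be
+        # TorchDynamo's codegen dropping dead references so intermediates are
+        # released as early as in the original eager code; they are reference
+        # drops, not graph operations.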
reshape_10 = None + view_117 = attn_output_42.view(-1, 768) + attn_output_42 = None + x_82 = torch.addmm( + l_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_bias_, + view_117, + l_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_bias_ = ( + view_117 + ) = ( + l_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_83 = x_82.view((1, 2, 768)) + x_82 = None + attn_output_43 = torch.nn.functional.dropout(x_83, 0.1, False, False) + x_83 = None + hidden_states_63 = attn_output_43 + hidden_states_61 + attn_output_43 = hidden_states_61 = None + hidden_states_64 = torch.nn.functional.layer_norm( + hidden_states_63, + (768,), + l_self_modules_h_modules_10_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_10_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_10_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_10_modules_ln_2_parameters_bias_ + ) = None + view_119 = hidden_states_64.view(-1, 768) + hidden_states_64 = None + x_84 = torch.addmm( + l_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_bias_, + view_119, + l_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_119 + ) = ( + l_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_85 = x_84.view((1, 2, 3072)) + x_84 = None + mul_41 = 0.5 * x_85 + pow_11 = torch.pow(x_85, 3.0) + mul_42 = 0.044715 * pow_11 + pow_11 = None + add_42 = x_85 + mul_42 + x_85 = mul_42 = None + mul_43 = 0.7978845608028654 * add_42 + add_42 = None + tanh_10 = torch.tanh(mul_43) + mul_43 = None + add_43 = 1.0 + tanh_10 + tanh_10 = None + hidden_states_65 = mul_41 * add_43 + mul_41 = add_43 = None + view_121 = hidden_states_65.view(-1, 3072) + hidden_states_65 = None + x_86 = torch.addmm( + l_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_bias_, + view_121, + l_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_121 + ) = ( + l_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_87 = x_86.view((1, 2, 768)) + x_86 = None + hidden_states_66 = torch.nn.functional.dropout(x_87, 0.1, False, False) + x_87 = None + hidden_states_67 = hidden_states_63 + hidden_states_66 + hidden_states_63 = hidden_states_66 = None + hidden_states_68 = torch.nn.functional.layer_norm( + hidden_states_67, + (768,), + l_self_modules_h_modules_11_modules_ln_1_parameters_weight_, + l_self_modules_h_modules_11_modules_ln_1_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_11_modules_ln_1_parameters_weight_ = ( + l_self_modules_h_modules_11_modules_ln_1_parameters_bias_ + ) = None + view_123 = hidden_states_68.view(-1, 768) + hidden_states_68 = None + x_88 = torch.addmm( + l_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_bias_, + view_123, + l_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_weight_, + ) + l_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_bias_ = ( + view_123 + ) = ( + l_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_weight_ + ) = None + x_89 = x_88.view((1, 2, 2304)) + x_88 = None + split_11 = x_89.split(768, dim=2) + x_89 = None + query_states_22 = split_11[0] + key_states_22 = split_11[1] + value_states_22 = 
split_11[2] + split_11 = None + view_125 = key_states_22.view((1, 2, -1, 64)) + key_states_22 = None + key_states_23 = view_125.transpose(1, 2) + view_125 = None + view_126 = value_states_22.view((1, 2, -1, 64)) + value_states_22 = None + value_states_23 = view_126.transpose(1, 2) + view_126 = None + view_127 = query_states_22.view((1, 2, -1, 64)) + query_states_22 = None + query_states_23 = view_127.transpose(1, 2) + view_127 = None + attention_mask_13 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + causal_mask_2 = None + query_11 = query_states_23.contiguous() + query_states_23 = None + key_11 = key_states_23.contiguous() + value_11 = value_states_23.contiguous() + attn_output_44 = torch._C._nn.scaled_dot_product_attention( + query_11, + key_11, + value_11, + attn_mask=attention_mask_13, + dropout_p=0.0, + scale=None, + is_causal=False, + ) + query_11 = key_11 = value_11 = attention_mask_13 = None + transpose_47 = attn_output_44.transpose(1, 2) + attn_output_44 = None + attn_output_45 = transpose_47.contiguous() + transpose_47 = None + reshape_11 = attn_output_45.reshape(1, 2, -1) + attn_output_45 = None + attn_output_46 = reshape_11.contiguous() + reshape_11 = None + view_128 = attn_output_46.view(-1, 768) + attn_output_46 = None + x_90 = torch.addmm( + l_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_bias_, + view_128, + l_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_bias_ = ( + view_128 + ) = ( + l_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_weight_ + ) = None + x_91 = x_90.view((1, 2, 768)) + x_90 = None + attn_output_47 = torch.nn.functional.dropout(x_91, 0.1, False, False) + x_91 = None + hidden_states_69 = attn_output_47 + hidden_states_67 + attn_output_47 = hidden_states_67 = None + hidden_states_70 = torch.nn.functional.layer_norm( + hidden_states_69, + (768,), + l_self_modules_h_modules_11_modules_ln_2_parameters_weight_, + l_self_modules_h_modules_11_modules_ln_2_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_11_modules_ln_2_parameters_weight_ = ( + l_self_modules_h_modules_11_modules_ln_2_parameters_bias_ + ) = None + view_130 = hidden_states_70.view(-1, 768) + hidden_states_70 = None + x_92 = torch.addmm( + l_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_bias_, + view_130, + l_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_weight_, + ) + l_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_bias_ = ( + view_130 + ) = ( + l_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_weight_ + ) = None + x_93 = x_92.view((1, 2, 3072)) + x_92 = None + mul_45 = 0.5 * x_93 + pow_12 = torch.pow(x_93, 3.0) + mul_46 = 0.044715 * pow_12 + pow_12 = None + add_46 = x_93 + mul_46 + x_93 = mul_46 = None + mul_47 = 0.7978845608028654 * add_46 + add_46 = None + tanh_11 = torch.tanh(mul_47) + mul_47 = None + add_47 = 1.0 + tanh_11 + tanh_11 = None + hidden_states_71 = mul_45 * add_47 + mul_45 = add_47 = None + view_132 = hidden_states_71.view(-1, 3072) + hidden_states_71 = None + x_94 = torch.addmm( + l_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_bias_, + view_132, + l_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_weight_, + ) + l_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_bias_ = ( + view_132 + ) = ( + 
l_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_weight_ + ) = None + x_95 = x_94.view((1, 2, 768)) + x_94 = None + hidden_states_72 = torch.nn.functional.dropout(x_95, 0.1, False, False) + x_95 = None + hidden_states_73 = hidden_states_69 + hidden_states_72 + hidden_states_69 = hidden_states_72 = None + hidden_states_74 = torch.nn.functional.layer_norm( + hidden_states_73, + (768,), + l_self_modules_ln_f_parameters_weight_, + l_self_modules_ln_f_parameters_bias_, + 1e-05, + ) + hidden_states_73 = ( + l_self_modules_ln_f_parameters_weight_ + ) = l_self_modules_ln_f_parameters_bias_ = None + hidden_states_75 = hidden_states_74.view((-1, 2, 768)) + hidden_states_74 = None + return ( + value_states_1, + key_states_1, + value_states_3, + key_states_3, + value_states_5, + key_states_5, + value_states_7, + key_states_7, + value_states_9, + key_states_9, + value_states_11, + key_states_11, + value_states_13, + key_states_13, + value_states_15, + key_states_15, + value_states_17, + key_states_17, + value_states_19, + key_states_19, + value_states_21, + key_states_21, + value_states_23, + key_states_23, + hidden_states_75, + ) diff --git a/samples/transformers-auto-model/openai-community/gpt2/weight_meta.py b/samples/transformers-auto-model/openai-community/gpt2/weight_meta.py new file mode 100644 index 000000000..33b95deda --- /dev/null +++ b/samples/transformers-auto-model/openai-community/gpt2/weight_meta.py @@ -0,0 +1,1498 @@ +class Program_weight_tensor_meta_L_self_modules_wte_parameters_weight_: + name = "L_self_modules_wte_parameters_weight_" + shape = [50257, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_input_ids_: + name = "L_input_ids_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [15496, 995] + + +class Program_weight_tensor_meta_L_self_modules_wpe_parameters_weight_: + name = "L_self_modules_wpe_parameters_weight_" + shape = [1024, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1] + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_0_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_0_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_1_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_1_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_2_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_2_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_3_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_3_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_4_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_4_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_5_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_5_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_6_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_6_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_7_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_7_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_8_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_8_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_9_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_9_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_10_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_10_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_ln_1_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_ln_1_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_ln_1_parameters_bias_: + name = "L_self_modules_h_modules_11_modules_ln_1_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_bias_: + name = "L_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_bias_" + shape = [2304] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_attn_modules_c_attn_parameters_weight_" + shape = [768, 2304] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_attn_modules_c_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_ln_2_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_ln_2_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_ln_2_parameters_bias_: + name = "L_self_modules_h_modules_11_modules_ln_2_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_bias_: + name = "L_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_mlp_modules_c_fc_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_bias_: + name = "L_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_mlp_modules_c_proj_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.004 + data = None + + +class Program_weight_tensor_meta_L_self_modules_ln_f_parameters_weight_: + name = "L_self_modules_ln_f_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_ln_f_parameters_bias_: + name = "L_self_modules_ln_f_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None From 81153ddee698b19625508acfced01ab3fbdd65a9 Mon Sep 17 00:00:00 2001 From: ooooo <3164076421@qq.com> Date: Wed, 20 Aug 2025 09:08:53 +0800 Subject: [PATCH 2/2] [New Sample] Add Some Text Generation Computational Graph --- .../EleutherAI/pythia-1b/graph_hash.txt | 1 + .../EleutherAI/pythia-1b/graph_net.json | 6 + .../EleutherAI/pythia-1b/input_meta.py | 0 .../pythia-1b/input_tensor_constraints.py | 0 .../EleutherAI/pythia-1b/model.py | 2660 ++++++ .../EleutherAI/pythia-1b/weight_meta.py | 2001 +++++ .../HuggingFaceTB/SmolLM3-3B/graph_hash.txt | 1 + .../HuggingFaceTB/SmolLM3-3B/graph_net.json | 6 + .../HuggingFaceTB/SmolLM3-3B/input_meta.py | 0 .../SmolLM3-3B/input_tensor_constraints.py | 0 
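Note on the weight_meta.py format: every entry above follows one schema, a class per captured tensor recording name, shape, dtype, device, and per-tensor mean/std, with data = None (the actual values are not checked in). For illustration only, a minimal sketch of how such a record could be turned into a statistics-matched stand-in tensor; materialize is a hypothetical helper, not an API shipped by this patch:

import torch

def materialize(meta):
    # Sample a stand-in tensor from a Program_weight_tensor_meta_* record.
    # Only shape/dtype/mean/std survive in these captures (data is None),
    # so the result is distribution-matched noise, not the trained weights.
    dtype = getattr(torch, meta.dtype.split(".")[-1])  # "torch.float32" -> torch.float32
    t = torch.randn(meta.shape, dtype=dtype) * meta.std + meta.mean
    return t.to(meta.device)  # "cuda:0" in these records; requires a visible GPU

Applied to the L_self_modules_ln_f_parameters_weight_ record above, this yields a [768] tensor of ones (mean 1.000, std 0.000), i.e. LayerNorm's identity initialization. The graph_net.json files added below are similarly small: framework "torch", num_devices_required and num_nodes_required of 1, and dynamic: false, i.e. a single-device, static-shape capture.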
.../HuggingFaceTB/SmolLM3-3B/model.py | 7500 +++++++++++++++++ .../HuggingFaceTB/SmolLM3-3B/weight_meta.py | 3288 ++++++++ .../TinyLlama-1.1B-Chat-v0.4/graph_hash.txt | 1 + .../TinyLlama-1.1B-Chat-v0.4/graph_net.json | 6 + .../TinyLlama-1.1B-Chat-v0.4/input_meta.py | 0 .../input_tensor_constraints.py | 0 .../TinyLlama-1.1B-Chat-v0.4/model.py | 4773 +++++++++++ .../TinyLlama-1.1B-Chat-v0.4/weight_meta.py | 2061 +++++ .../bigscience/bloom-560m/graph_hash.txt | 1 + .../bigscience/bloom-560m/graph_net.json | 6 + .../bigscience/bloom-560m/input_meta.py | 0 .../bloom-560m/input_tensor_constraints.py | 0 .../bigscience/bloom-560m/model.py | 3799 +++++++++ .../bigscience/bloom-560m/weight_meta.py | 3102 +++++++ 24 files changed, 29212 insertions(+) create mode 100644 samples/transformers-auto-model/EleutherAI/pythia-1b/graph_hash.txt create mode 100644 samples/transformers-auto-model/EleutherAI/pythia-1b/graph_net.json create mode 100644 samples/transformers-auto-model/EleutherAI/pythia-1b/input_meta.py create mode 100644 samples/transformers-auto-model/EleutherAI/pythia-1b/input_tensor_constraints.py create mode 100644 samples/transformers-auto-model/EleutherAI/pythia-1b/model.py create mode 100644 samples/transformers-auto-model/EleutherAI/pythia-1b/weight_meta.py create mode 100644 samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/graph_hash.txt create mode 100644 samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/graph_net.json create mode 100644 samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/input_meta.py create mode 100644 samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/input_tensor_constraints.py create mode 100644 samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/model.py create mode 100644 samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/weight_meta.py create mode 100644 samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/graph_hash.txt create mode 100644 samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/graph_net.json create mode 100644 samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/input_meta.py create mode 100644 samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/input_tensor_constraints.py create mode 100644 samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/model.py create mode 100644 samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/weight_meta.py create mode 100644 samples/transformers-auto-model/bigscience/bloom-560m/graph_hash.txt create mode 100644 samples/transformers-auto-model/bigscience/bloom-560m/graph_net.json create mode 100644 samples/transformers-auto-model/bigscience/bloom-560m/input_meta.py create mode 100644 samples/transformers-auto-model/bigscience/bloom-560m/input_tensor_constraints.py create mode 100644 samples/transformers-auto-model/bigscience/bloom-560m/model.py create mode 100644 samples/transformers-auto-model/bigscience/bloom-560m/weight_meta.py diff --git a/samples/transformers-auto-model/EleutherAI/pythia-1b/graph_hash.txt b/samples/transformers-auto-model/EleutherAI/pythia-1b/graph_hash.txt new file mode 100644 index 000000000..ce6e9b10a --- /dev/null +++ b/samples/transformers-auto-model/EleutherAI/pythia-1b/graph_hash.txt @@ -0,0 +1 @@ +41b77f6806eb4bb399446db2319bd7d2d36a108a626158bd2fe3ddad929c156d \ No newline at end of file diff --git a/samples/transformers-auto-model/EleutherAI/pythia-1b/graph_net.json b/samples/transformers-auto-model/EleutherAI/pythia-1b/graph_net.json new file mode 100644 index 
000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/EleutherAI/pythia-1b/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/EleutherAI/pythia-1b/input_meta.py b/samples/transformers-auto-model/EleutherAI/pythia-1b/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/EleutherAI/pythia-1b/input_tensor_constraints.py b/samples/transformers-auto-model/EleutherAI/pythia-1b/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/EleutherAI/pythia-1b/model.py b/samples/transformers-auto-model/EleutherAI/pythia-1b/model.py new file mode 100644 index 000000000..c999671fe --- /dev/null +++ b/samples/transformers-auto-model/EleutherAI/pythia-1b/model.py @@ -0,0 +1,2660 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_inputs_embeds_: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_self_modules_rotary_emb_buffers_inv_freq_: torch.Tensor, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + 
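# (annotation, not emitted by the capture) every one of the 16 decoder layers contributes the same 12 flattened inputs: input_layernorm, attention query_key_value, attention dense, post_attention_layernorm, mlp dense_h_to_4h, and mlp dense_4h_to_h, weight and bias each; only the shared rotary_emb inv_freq buffer is passed once. +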
L_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_final_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_final_layer_norm_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_inputs_embeds_ = L_inputs_embeds_ + l_attention_mask_ = L_attention_mask_ + l_self_modules_rotary_emb_buffers_inv_freq_ = ( + L_self_modules_rotary_emb_buffers_inv_freq_ + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_bias_ = 
L_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + 
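# (annotation, not emitted by the capture) this prologue only rebinds the flattened L_* graph inputs to local l_* names, one assignment per parameter; no tensor computation happens yet. +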
l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = 
L_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_weight_ + 
l_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_ = ( + 
L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + 
l_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = 
L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_weight_ + 
l_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + 
l_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_weight_ = L_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_weight_ + l_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_bias_ = L_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_bias_ + l_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_weight_ = L_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_weight_ + l_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_bias_ = L_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_bias_ + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + 
+        l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_bias_
+        l_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_
+        l_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_
+        l_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_
+        l_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_
+        l_self_modules_final_layer_norm_parameters_weight_ = L_self_modules_final_layer_norm_parameters_weight_
+        l_self_modules_final_layer_norm_parameters_bias_ = L_self_modules_final_layer_norm_parameters_bias_
+        cache_position = torch.arange(0, 2, device=device(type="cuda", index=0))
+        position_ids = cache_position.unsqueeze(0)
+        attention_mask = l_attention_mask_.to(device=device(type="cuda", index=0), dtype=torch.bool)
+        l_attention_mask_ = None
+        mask_indices = torch.arange(2, device=device(type="cuda", index=0))
+        mask_indices += 0
+        mask_indices_1 = mask_indices
+        mask_indices = None
+        local_padding_mask = attention_mask[(slice(None, None, None), mask_indices_1)]
+        attention_mask = mask_indices_1 = None
+        kv_arange = torch.arange(2, device=device(type="cuda", index=0))
+        kv_arange += 0
+        kv_arange_1 = kv_arange
+        kv_arange = None
+        reshaped_cache_position = cache_position.view(-1, 1)
+        cache_position = None
+        causal_mask = kv_arange_1 <= reshaped_cache_position
+        kv_arange_1 = reshaped_cache_position = None
+        getitem_1 = causal_mask[(None, None, slice(None, None, None), slice(None, None, None))]
+        causal_mask = None
+        causal_mask_1 = getitem_1.expand(1, -1, -1, -1)
+        getitem_1 = None
+        getitem_2 = local_padding_mask[(slice(None, None, None), None, None, slice(None, None, None))]
+        local_padding_mask = None
+        causal_mask_2 = causal_mask_1 * getitem_2
+        causal_mask_1 = getitem_2 = None
+        hidden_states = torch.nn.functional.dropout(l_inputs_embeds_, 0.0, False, False)
+        l_inputs_embeds_ = None
+        _set_grad_enabled = torch._C._set_grad_enabled(False)
+        _set_grad_enabled = None
+        getitem_3 = l_self_modules_rotary_emb_buffers_inv_freq_[(None, slice(None, None, None), None)]
+        l_self_modules_rotary_emb_buffers_inv_freq_ = None
+        float_1 = getitem_3.float()
+        getitem_3 = None
+        expand_1 = float_1.expand(1, -1, 1)
+        float_1 = None
+        inv_freq_expanded = expand_1.to(device(type="cuda", index=0))
+        expand_1 = None
+        getitem_4 = position_ids[(slice(None, None, None), None, slice(None, None, None))]
+        position_ids = None
+        position_ids_expanded = getitem_4.float()
+        getitem_4 = None
+        float_3 = inv_freq_expanded.float()
+        inv_freq_expanded = None
+        float_4 = position_ids_expanded.float()
+        position_ids_expanded = None
+        matmul = float_3 @ float_4
+        float_3 = float_4 = None
+        freqs = matmul.transpose(1, 2)
+        matmul = None
+        emb = torch.cat((freqs, freqs), dim=-1)
+        freqs = None
+        cos = emb.cos()
+        cos_1 = cos * 1.0
+        cos = None
+        sin = emb.sin()
+        emb = None
+        sin_1 = sin * 1.0
+        sin = None
+        cos_2 = cos_1.to(dtype=torch.float16)
+        cos_1 = None
+        sin_2 = sin_1.to(dtype=torch.float16)
+        sin_1 = None
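The captured prologue ends here. Everything before this point unpacks the flattened Dynamo argument names (a name such as L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ mirrors the module path self.layers[0].input_layernorm.weight) and then precomputes the two things every layer below reuses: a boolean causal-and-padding attention mask (causal_mask_2) and the rotary cos/sin tables (cos_2, sin_2). Below is a minimal, self-contained sketch of those two computations; the helper name and its parameters are hypothetical, and the traced constants (sequence length 2, device cuda:0) are left as arguments for clarity:

    import torch

    def build_mask_and_rotary(attention_mask, inv_freq, seq_len=2, dtype=torch.float16):
        # hypothetical helper; the captured graph hard-codes seq_len=2 and cuda:0
        cache_position = torch.arange(seq_len)
        position_ids = cache_position.unsqueeze(0)            # (1, S)
        padding = attention_mask.to(dtype=torch.bool)         # (B, S)
        kv_arange = torch.arange(seq_len)                     # key positions
        causal = kv_arange <= cache_position.view(-1, 1)      # (S, S): key <= query
        # boolean "AND" via multiply, exactly as the graph does: (1,1,S,S) * (B,1,1,S)
        mask = causal[None, None, :, :] * padding[:, None, None, :]
        # rotary tables: outer product of positions and inverse frequencies,
        # duplicated along the last dim before taking cos/sin
        freqs = (inv_freq[None, :, None].float()
                 @ position_ids[:, None, :].float()).transpose(1, 2)
        emb = torch.cat((freqs, freqs), dim=-1)               # (1, S, rotary_dim)
        return mask, emb.cos().to(dtype), emb.sin().to(dtype)

    inv_freq = 1.0 / (10000 ** (torch.arange(0, 64, 2).float() / 64))  # 32 freqs -> rotary_dim 64
    mask, cos, sin = build_mask_and_rotary(torch.ones(1, 2), inv_freq)

The `* 1.0` applied to cos/sin above is the rotary attention-scaling factor, which this capture traced as 1.0; the final cast to float16 matches the model's weight dtype.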
+        _set_grad_enabled_1 = torch._C._set_grad_enabled(True)
+        _set_grad_enabled_1 = None
+        layer_norm = torch.nn.functional.layer_norm(
+            hidden_states,
+            (2048,),
+            l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_,
+            l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_,
+            1e-05,
+        )
+        l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_ = None
+        linear = torch._C._nn.linear(
+            layer_norm,
+            l_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_weight_,
+            l_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_bias_,
+        )
+        layer_norm = l_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_bias_ = None
+        view_1 = linear.view((1, 2, -1, 768))
+        linear = None
+        qkv = view_1.transpose(1, 2)
+        view_1 = None
+        chunk = qkv.chunk(3, dim=-1)
+        qkv = None
+        query_states = chunk[0]
+        key_states = chunk[1]
+        value_states = chunk[2]
+        chunk = None
+        cos_3 = cos_2.unsqueeze(1)
+        sin_3 = sin_2.unsqueeze(1)
+        q_rot = query_states[(Ellipsis, slice(None, 64, None))]
+        q_pass = query_states[(Ellipsis, slice(64, None, None))]
+        query_states = None
+        k_rot = key_states[(Ellipsis, slice(None, 64, None))]
+        k_pass = key_states[(Ellipsis, slice(64, None, None))]
+        key_states = None
+        mul_3 = q_rot * cos_3
+        x1 = q_rot[(Ellipsis, slice(None, 32, None))]
+        x2 = q_rot[(Ellipsis, slice(32, None, None))]
+        q_rot = None
+        neg = -x2
+        x2 = None
+        cat_1 = torch.cat((neg, x1), dim=-1)
+        neg = x1 = None
+        mul_4 = cat_1 * sin_3
+        cat_1 = None
+        q_embed = mul_3 + mul_4
+        mul_3 = mul_4 = None
+        mul_5 = k_rot * cos_3
+        cos_3 = None
+        x1_1 = k_rot[(Ellipsis, slice(None, 32, None))]
+        x2_1 = k_rot[(Ellipsis, slice(32, None, None))]
+        k_rot = None
+        neg_1 = -x2_1
+        x2_1 = None
+        cat_2 = torch.cat((neg_1, x1_1), dim=-1)
+        neg_1 = x1_1 = None
+        mul_6 = cat_2 * sin_3
+        cat_2 = sin_3 = None
+        k_embed = mul_5 + mul_6
+        mul_5 = mul_6 = None
+        q_embed_1 = torch.cat([q_embed, q_pass], dim=-1)
+        q_embed = q_pass = None
+        k_embed_1 = torch.cat([k_embed, k_pass], dim=-1)
+        k_embed = k_pass = None
+        attention_mask_1 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+        query = q_embed_1.contiguous()
+        q_embed_1 = None
+        key = k_embed_1.contiguous()
+        value = value_states.contiguous()
+        attn_output = torch._C._nn.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask_1,
+            dropout_p=0.0,
+            scale=0.0625,
+            is_causal=False,
+        )
+        query = key = value = attention_mask_1 = None
+        transpose_2 = attn_output.transpose(1, 2)
+        attn_output = None
+        attn_output_1 = transpose_2.contiguous()
+        transpose_2 = None
+        reshape = attn_output_1.reshape(1, 2, -1)
+        attn_output_1 = None
+        attn_output_2 = reshape.contiguous()
+        reshape = None
+        attn_output_3 = torch._C._nn.linear(
+            attn_output_2,
+            l_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_weight_,
+            l_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_bias_,
+        )
+        attn_output_2 = l_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_bias_ = None
+        attn_output_4 = torch.nn.functional.dropout(attn_output_3, 0.0, False,
False) + attn_output_3 = None + layer_norm_1 = torch.nn.functional.layer_norm( + hidden_states, + (2048,), + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_1 = torch._C._nn.linear( + layer_norm_1, + l_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_1 = l_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_2 = torch._C._nn.gelu(hidden_states_1) + hidden_states_1 = None + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_2 = l_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output = torch.nn.functional.dropout(hidden_states_3, 0.0, False, False) + hidden_states_3 = None + add_2 = mlp_output + attn_output_4 + mlp_output = attn_output_4 = None + hidden_states_4 = add_2 + hidden_states + add_2 = hidden_states = None + layer_norm_2 = torch.nn.functional.layer_norm( + hidden_states_4, + (2048,), + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_ + ) = None + linear_4 = torch._C._nn.linear( + layer_norm_2, + l_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_2 = l_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_2 = linear_4.view((1, 2, -1, 768)) + linear_4 = None + qkv_1 = view_2.transpose(1, 2) + view_2 = None + chunk_1 = qkv_1.chunk(3, dim=-1) + qkv_1 = None + query_states_1 = chunk_1[0] + key_states_1 = chunk_1[1] + value_states_1 = chunk_1[2] + chunk_1 = None + cos_4 = cos_2.unsqueeze(1) + sin_4 = sin_2.unsqueeze(1) + q_rot_1 = query_states_1[(Ellipsis, slice(None, 64, None))] + q_pass_1 = query_states_1[(Ellipsis, slice(64, None, None))] + query_states_1 = None + k_rot_1 = key_states_1[(Ellipsis, slice(None, 64, None))] + k_pass_1 = key_states_1[(Ellipsis, slice(64, None, None))] + key_states_1 = None + mul_7 = q_rot_1 * cos_4 + x1_2 = q_rot_1[(Ellipsis, slice(None, 32, None))] + x2_2 = q_rot_1[(Ellipsis, slice(32, None, None))] + q_rot_1 = None + neg_2 = -x2_2 + x2_2 = None + cat_5 = torch.cat((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + mul_8 = cat_5 * sin_4 + cat_5 = None + q_embed_2 = mul_7 + mul_8 + mul_7 = mul_8 = None + mul_9 = k_rot_1 * cos_4 + cos_4 = None + x1_3 = k_rot_1[(Ellipsis, 
slice(None, 32, None))] + x2_3 = k_rot_1[(Ellipsis, slice(32, None, None))] + k_rot_1 = None + neg_3 = -x2_3 + x2_3 = None + cat_6 = torch.cat((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + mul_10 = cat_6 * sin_4 + cat_6 = sin_4 = None + k_embed_2 = mul_9 + mul_10 + mul_9 = mul_10 = None + q_embed_3 = torch.cat([q_embed_2, q_pass_1], dim=-1) + q_embed_2 = q_pass_1 = None + k_embed_3 = torch.cat([k_embed_2, k_pass_1], dim=-1) + k_embed_2 = k_pass_1 = None + attention_mask_2 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_1 = q_embed_3.contiguous() + q_embed_3 = None + key_1 = k_embed_3.contiguous() + value_1 = value_states_1.contiguous() + attn_output_5 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_1, + value_1, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_1 = key_1 = value_1 = attention_mask_2 = None + transpose_4 = attn_output_5.transpose(1, 2) + attn_output_5 = None + attn_output_6 = transpose_4.contiguous() + transpose_4 = None + reshape_1 = attn_output_6.reshape(1, 2, -1) + attn_output_6 = None + attn_output_7 = reshape_1.contiguous() + reshape_1 = None + attn_output_8 = torch._C._nn.linear( + attn_output_7, + l_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_7 = l_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_9 = torch.nn.functional.dropout(attn_output_8, 0.0, False, False) + attn_output_8 = None + layer_norm_3 = torch.nn.functional.layer_norm( + hidden_states_4, + (2048,), + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_5 = torch._C._nn.linear( + layer_norm_3, + l_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_3 = l_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_6 = torch._C._nn.gelu(hidden_states_5) + hidden_states_5 = None + hidden_states_7 = torch._C._nn.linear( + hidden_states_6, + l_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_6 = l_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_1 = torch.nn.functional.dropout(hidden_states_7, 0.0, False, False) + hidden_states_7 = None + add_6 = mlp_output_1 + attn_output_9 + mlp_output_1 = attn_output_9 = None + hidden_states_8 = add_6 + hidden_states_4 + add_6 = hidden_states_4 = None + layer_norm_4 = torch.nn.functional.layer_norm( + hidden_states_8, + (2048,), + 
l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_ + ) = None + linear_8 = torch._C._nn.linear( + layer_norm_4, + l_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_4 = l_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_3 = linear_8.view((1, 2, -1, 768)) + linear_8 = None + qkv_2 = view_3.transpose(1, 2) + view_3 = None + chunk_2 = qkv_2.chunk(3, dim=-1) + qkv_2 = None + query_states_2 = chunk_2[0] + key_states_2 = chunk_2[1] + value_states_2 = chunk_2[2] + chunk_2 = None + cos_5 = cos_2.unsqueeze(1) + sin_5 = sin_2.unsqueeze(1) + q_rot_2 = query_states_2[(Ellipsis, slice(None, 64, None))] + q_pass_2 = query_states_2[(Ellipsis, slice(64, None, None))] + query_states_2 = None + k_rot_2 = key_states_2[(Ellipsis, slice(None, 64, None))] + k_pass_2 = key_states_2[(Ellipsis, slice(64, None, None))] + key_states_2 = None + mul_11 = q_rot_2 * cos_5 + x1_4 = q_rot_2[(Ellipsis, slice(None, 32, None))] + x2_4 = q_rot_2[(Ellipsis, slice(32, None, None))] + q_rot_2 = None + neg_4 = -x2_4 + x2_4 = None + cat_9 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_12 = cat_9 * sin_5 + cat_9 = None + q_embed_4 = mul_11 + mul_12 + mul_11 = mul_12 = None + mul_13 = k_rot_2 * cos_5 + cos_5 = None + x1_5 = k_rot_2[(Ellipsis, slice(None, 32, None))] + x2_5 = k_rot_2[(Ellipsis, slice(32, None, None))] + k_rot_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_10 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_14 = cat_10 * sin_5 + cat_10 = sin_5 = None + k_embed_4 = mul_13 + mul_14 + mul_13 = mul_14 = None + q_embed_5 = torch.cat([q_embed_4, q_pass_2], dim=-1) + q_embed_4 = q_pass_2 = None + k_embed_5 = torch.cat([k_embed_4, k_pass_2], dim=-1) + k_embed_4 = k_pass_2 = None + attention_mask_3 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_2 = q_embed_5.contiguous() + q_embed_5 = None + key_2 = k_embed_5.contiguous() + value_2 = value_states_2.contiguous() + attn_output_10 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_2, + value_2, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_2 = key_2 = value_2 = attention_mask_3 = None + transpose_6 = attn_output_10.transpose(1, 2) + attn_output_10 = None + attn_output_11 = transpose_6.contiguous() + transpose_6 = None + reshape_2 = attn_output_11.reshape(1, 2, -1) + attn_output_11 = None + attn_output_12 = reshape_2.contiguous() + reshape_2 = None + attn_output_13 = torch._C._nn.linear( + attn_output_12, + l_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_12 = l_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_14 = torch.nn.functional.dropout(attn_output_13, 0.0, 
False, False) + attn_output_13 = None + layer_norm_5 = torch.nn.functional.layer_norm( + hidden_states_8, + (2048,), + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_9 = torch._C._nn.linear( + layer_norm_5, + l_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_5 = l_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_10 = torch._C._nn.gelu(hidden_states_9) + hidden_states_9 = None + hidden_states_11 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_10 = l_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_2 = torch.nn.functional.dropout(hidden_states_11, 0.0, False, False) + hidden_states_11 = None + add_10 = mlp_output_2 + attn_output_14 + mlp_output_2 = attn_output_14 = None + hidden_states_12 = add_10 + hidden_states_8 + add_10 = hidden_states_8 = None + layer_norm_6 = torch.nn.functional.layer_norm( + hidden_states_12, + (2048,), + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_ + ) = None + linear_12 = torch._C._nn.linear( + layer_norm_6, + l_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_6 = l_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_4 = linear_12.view((1, 2, -1, 768)) + linear_12 = None + qkv_3 = view_4.transpose(1, 2) + view_4 = None + chunk_3 = qkv_3.chunk(3, dim=-1) + qkv_3 = None + query_states_3 = chunk_3[0] + key_states_3 = chunk_3[1] + value_states_3 = chunk_3[2] + chunk_3 = None + cos_6 = cos_2.unsqueeze(1) + sin_6 = sin_2.unsqueeze(1) + q_rot_3 = query_states_3[(Ellipsis, slice(None, 64, None))] + q_pass_3 = query_states_3[(Ellipsis, slice(64, None, None))] + query_states_3 = None + k_rot_3 = key_states_3[(Ellipsis, slice(None, 64, None))] + k_pass_3 = key_states_3[(Ellipsis, slice(64, None, None))] + key_states_3 = None + mul_15 = q_rot_3 * cos_6 + x1_6 = q_rot_3[(Ellipsis, slice(None, 32, None))] + x2_6 = q_rot_3[(Ellipsis, slice(32, None, None))] + q_rot_3 = None + neg_6 = -x2_6 + x2_6 = None + cat_13 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_16 = cat_13 * sin_6 + cat_13 = None + q_embed_6 = mul_15 + mul_16 + mul_15 = mul_16 = None + mul_17 = k_rot_3 * 
cos_6 + cos_6 = None + x1_7 = k_rot_3[(Ellipsis, slice(None, 32, None))] + x2_7 = k_rot_3[(Ellipsis, slice(32, None, None))] + k_rot_3 = None + neg_7 = -x2_7 + x2_7 = None + cat_14 = torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_18 = cat_14 * sin_6 + cat_14 = sin_6 = None + k_embed_6 = mul_17 + mul_18 + mul_17 = mul_18 = None + q_embed_7 = torch.cat([q_embed_6, q_pass_3], dim=-1) + q_embed_6 = q_pass_3 = None + k_embed_7 = torch.cat([k_embed_6, k_pass_3], dim=-1) + k_embed_6 = k_pass_3 = None + attention_mask_4 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_3 = q_embed_7.contiguous() + q_embed_7 = None + key_3 = k_embed_7.contiguous() + value_3 = value_states_3.contiguous() + attn_output_15 = torch._C._nn.scaled_dot_product_attention( + query_3, + key_3, + value_3, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_3 = key_3 = value_3 = attention_mask_4 = None + transpose_8 = attn_output_15.transpose(1, 2) + attn_output_15 = None + attn_output_16 = transpose_8.contiguous() + transpose_8 = None + reshape_3 = attn_output_16.reshape(1, 2, -1) + attn_output_16 = None + attn_output_17 = reshape_3.contiguous() + reshape_3 = None + attn_output_18 = torch._C._nn.linear( + attn_output_17, + l_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_17 = l_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_19 = torch.nn.functional.dropout(attn_output_18, 0.0, False, False) + attn_output_18 = None + layer_norm_7 = torch.nn.functional.layer_norm( + hidden_states_12, + (2048,), + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_13 = torch._C._nn.linear( + layer_norm_7, + l_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_7 = l_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_14 = torch._C._nn.gelu(hidden_states_13) + hidden_states_13 = None + hidden_states_15 = torch._C._nn.linear( + hidden_states_14, + l_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_14 = l_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_3 = torch.nn.functional.dropout(hidden_states_15, 0.0, False, False) + hidden_states_15 = None + add_14 = mlp_output_3 + attn_output_19 + mlp_output_3 = attn_output_19 = None + hidden_states_16 = add_14 + hidden_states_12 + add_14 = hidden_states_12 = None + layer_norm_8 = torch.nn.functional.layer_norm( + 
hidden_states_16, + (2048,), + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_ + ) = None + linear_16 = torch._C._nn.linear( + layer_norm_8, + l_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_8 = l_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_5 = linear_16.view((1, 2, -1, 768)) + linear_16 = None + qkv_4 = view_5.transpose(1, 2) + view_5 = None + chunk_4 = qkv_4.chunk(3, dim=-1) + qkv_4 = None + query_states_4 = chunk_4[0] + key_states_4 = chunk_4[1] + value_states_4 = chunk_4[2] + chunk_4 = None + cos_7 = cos_2.unsqueeze(1) + sin_7 = sin_2.unsqueeze(1) + q_rot_4 = query_states_4[(Ellipsis, slice(None, 64, None))] + q_pass_4 = query_states_4[(Ellipsis, slice(64, None, None))] + query_states_4 = None + k_rot_4 = key_states_4[(Ellipsis, slice(None, 64, None))] + k_pass_4 = key_states_4[(Ellipsis, slice(64, None, None))] + key_states_4 = None + mul_19 = q_rot_4 * cos_7 + x1_8 = q_rot_4[(Ellipsis, slice(None, 32, None))] + x2_8 = q_rot_4[(Ellipsis, slice(32, None, None))] + q_rot_4 = None + neg_8 = -x2_8 + x2_8 = None + cat_17 = torch.cat((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + mul_20 = cat_17 * sin_7 + cat_17 = None + q_embed_8 = mul_19 + mul_20 + mul_19 = mul_20 = None + mul_21 = k_rot_4 * cos_7 + cos_7 = None + x1_9 = k_rot_4[(Ellipsis, slice(None, 32, None))] + x2_9 = k_rot_4[(Ellipsis, slice(32, None, None))] + k_rot_4 = None + neg_9 = -x2_9 + x2_9 = None + cat_18 = torch.cat((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + mul_22 = cat_18 * sin_7 + cat_18 = sin_7 = None + k_embed_8 = mul_21 + mul_22 + mul_21 = mul_22 = None + q_embed_9 = torch.cat([q_embed_8, q_pass_4], dim=-1) + q_embed_8 = q_pass_4 = None + k_embed_9 = torch.cat([k_embed_8, k_pass_4], dim=-1) + k_embed_8 = k_pass_4 = None + attention_mask_5 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_4 = q_embed_9.contiguous() + q_embed_9 = None + key_4 = k_embed_9.contiguous() + value_4 = value_states_4.contiguous() + attn_output_20 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_4, + value_4, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_4 = key_4 = value_4 = attention_mask_5 = None + transpose_10 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_10.contiguous() + transpose_10 = None + reshape_4 = attn_output_21.reshape(1, 2, -1) + attn_output_21 = None + attn_output_22 = reshape_4.contiguous() + reshape_4 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_22 = l_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_24 = 
torch.nn.functional.dropout(attn_output_23, 0.0, False, False) + attn_output_23 = None + layer_norm_9 = torch.nn.functional.layer_norm( + hidden_states_16, + (2048,), + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_17 = torch._C._nn.linear( + layer_norm_9, + l_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_9 = l_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_18 = torch._C._nn.gelu(hidden_states_17) + hidden_states_17 = None + hidden_states_19 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_18 = l_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_4 = torch.nn.functional.dropout(hidden_states_19, 0.0, False, False) + hidden_states_19 = None + add_18 = mlp_output_4 + attn_output_24 + mlp_output_4 = attn_output_24 = None + hidden_states_20 = add_18 + hidden_states_16 + add_18 = hidden_states_16 = None + layer_norm_10 = torch.nn.functional.layer_norm( + hidden_states_20, + (2048,), + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_ + ) = None + linear_20 = torch._C._nn.linear( + layer_norm_10, + l_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_10 = l_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_6 = linear_20.view((1, 2, -1, 768)) + linear_20 = None + qkv_5 = view_6.transpose(1, 2) + view_6 = None + chunk_5 = qkv_5.chunk(3, dim=-1) + qkv_5 = None + query_states_5 = chunk_5[0] + key_states_5 = chunk_5[1] + value_states_5 = chunk_5[2] + chunk_5 = None + cos_8 = cos_2.unsqueeze(1) + sin_8 = sin_2.unsqueeze(1) + q_rot_5 = query_states_5[(Ellipsis, slice(None, 64, None))] + q_pass_5 = query_states_5[(Ellipsis, slice(64, None, None))] + query_states_5 = None + k_rot_5 = key_states_5[(Ellipsis, slice(None, 64, None))] + k_pass_5 = key_states_5[(Ellipsis, slice(64, None, None))] + key_states_5 = None + mul_23 = q_rot_5 * cos_8 + x1_10 = q_rot_5[(Ellipsis, slice(None, 32, None))] + x2_10 = q_rot_5[(Ellipsis, slice(32, None, None))] + q_rot_5 = None + neg_10 = -x2_10 + x2_10 = None + cat_21 = torch.cat((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + mul_24 = cat_21 * sin_8 + cat_21 = None + q_embed_10 
= mul_23 + mul_24 + mul_23 = mul_24 = None + mul_25 = k_rot_5 * cos_8 + cos_8 = None + x1_11 = k_rot_5[(Ellipsis, slice(None, 32, None))] + x2_11 = k_rot_5[(Ellipsis, slice(32, None, None))] + k_rot_5 = None + neg_11 = -x2_11 + x2_11 = None + cat_22 = torch.cat((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + mul_26 = cat_22 * sin_8 + cat_22 = sin_8 = None + k_embed_10 = mul_25 + mul_26 + mul_25 = mul_26 = None + q_embed_11 = torch.cat([q_embed_10, q_pass_5], dim=-1) + q_embed_10 = q_pass_5 = None + k_embed_11 = torch.cat([k_embed_10, k_pass_5], dim=-1) + k_embed_10 = k_pass_5 = None + attention_mask_6 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_5 = q_embed_11.contiguous() + q_embed_11 = None + key_5 = k_embed_11.contiguous() + value_5 = value_states_5.contiguous() + attn_output_25 = torch._C._nn.scaled_dot_product_attention( + query_5, + key_5, + value_5, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_5 = key_5 = value_5 = attention_mask_6 = None + transpose_12 = attn_output_25.transpose(1, 2) + attn_output_25 = None + attn_output_26 = transpose_12.contiguous() + transpose_12 = None + reshape_5 = attn_output_26.reshape(1, 2, -1) + attn_output_26 = None + attn_output_27 = reshape_5.contiguous() + reshape_5 = None + attn_output_28 = torch._C._nn.linear( + attn_output_27, + l_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_27 = l_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_29 = torch.nn.functional.dropout(attn_output_28, 0.0, False, False) + attn_output_28 = None + layer_norm_11 = torch.nn.functional.layer_norm( + hidden_states_20, + (2048,), + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_21 = torch._C._nn.linear( + layer_norm_11, + l_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_11 = l_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_22 = torch._C._nn.gelu(hidden_states_21) + hidden_states_21 = None + hidden_states_23 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_22 = l_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_5 = torch.nn.functional.dropout(hidden_states_23, 0.0, False, False) + hidden_states_23 = None + add_22 = mlp_output_5 + attn_output_29 + mlp_output_5 = attn_output_29 = None + hidden_states_24 = add_22 + hidden_states_20 + 
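+ # --- Reading aid (not part of the captured graph): the trace repeats one
+ # fixed pattern per decoder layer (layers 4 and 5 above; the same shape
+ # continues through layer 15 below). A minimal, hypothetical re-rolled form
+ # of that pattern is sketched here; `NeoXBlockSketch` and its defaults
+ # (hidden=2048, intermediate=8192, eps=1e-5) are inferred from the (2048,)
+ # layer_norm shape and the 2048 -> 8192 -> 2048 MLP in the trace, in the
+ # GPT-NeoX style used by pythia-1b (per the file path below).
+ #
+ # import torch.nn as nn
+ # import torch.nn.functional as F
+ #
+ # class NeoXBlockSketch(nn.Module):
+ #     def __init__(self, hidden=2048, intermediate=8192, eps=1e-5):
+ #         super().__init__()
+ #         self.input_layernorm = nn.LayerNorm(hidden, eps=eps)
+ #         self.post_attention_layernorm = nn.LayerNorm(hidden, eps=eps)
+ #         self.dense_h_to_4h = nn.Linear(hidden, intermediate)
+ #         self.dense_4h_to_h = nn.Linear(intermediate, hidden)
+ #
+ #     def forward(self, hidden_states, attn_fn):
+ #         # Attention and MLP both read the same residual input ("parallel
+ #         # residual"): in the trace, hidden_states_20 =
+ #         # (mlp_output_4 + attn_output_24) + hidden_states_16.
+ #         attn_out = attn_fn(self.input_layernorm(hidden_states))
+ #         mlp_out = self.dense_4h_to_h(
+ #             F.gelu(self.dense_h_to_4h(self.post_attention_layernorm(hidden_states)))
+ #         )
+ #         return mlp_out + attn_out + hidden_states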
add_22 = hidden_states_20 = None + layer_norm_12 = torch.nn.functional.layer_norm( + hidden_states_24, + (2048,), + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_ + ) = None + linear_24 = torch._C._nn.linear( + layer_norm_12, + l_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_12 = l_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_7 = linear_24.view((1, 2, -1, 768)) + linear_24 = None + qkv_6 = view_7.transpose(1, 2) + view_7 = None + chunk_6 = qkv_6.chunk(3, dim=-1) + qkv_6 = None + query_states_6 = chunk_6[0] + key_states_6 = chunk_6[1] + value_states_6 = chunk_6[2] + chunk_6 = None + cos_9 = cos_2.unsqueeze(1) + sin_9 = sin_2.unsqueeze(1) + q_rot_6 = query_states_6[(Ellipsis, slice(None, 64, None))] + q_pass_6 = query_states_6[(Ellipsis, slice(64, None, None))] + query_states_6 = None + k_rot_6 = key_states_6[(Ellipsis, slice(None, 64, None))] + k_pass_6 = key_states_6[(Ellipsis, slice(64, None, None))] + key_states_6 = None + mul_27 = q_rot_6 * cos_9 + x1_12 = q_rot_6[(Ellipsis, slice(None, 32, None))] + x2_12 = q_rot_6[(Ellipsis, slice(32, None, None))] + q_rot_6 = None + neg_12 = -x2_12 + x2_12 = None + cat_25 = torch.cat((neg_12, x1_12), dim=-1) + neg_12 = x1_12 = None + mul_28 = cat_25 * sin_9 + cat_25 = None + q_embed_12 = mul_27 + mul_28 + mul_27 = mul_28 = None + mul_29 = k_rot_6 * cos_9 + cos_9 = None + x1_13 = k_rot_6[(Ellipsis, slice(None, 32, None))] + x2_13 = k_rot_6[(Ellipsis, slice(32, None, None))] + k_rot_6 = None + neg_13 = -x2_13 + x2_13 = None + cat_26 = torch.cat((neg_13, x1_13), dim=-1) + neg_13 = x1_13 = None + mul_30 = cat_26 * sin_9 + cat_26 = sin_9 = None + k_embed_12 = mul_29 + mul_30 + mul_29 = mul_30 = None + q_embed_13 = torch.cat([q_embed_12, q_pass_6], dim=-1) + q_embed_12 = q_pass_6 = None + k_embed_13 = torch.cat([k_embed_12, k_pass_6], dim=-1) + k_embed_12 = k_pass_6 = None + attention_mask_7 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_6 = q_embed_13.contiguous() + q_embed_13 = None + key_6 = k_embed_13.contiguous() + value_6 = value_states_6.contiguous() + attn_output_30 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_6, + value_6, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_6 = key_6 = value_6 = attention_mask_7 = None + transpose_14 = attn_output_30.transpose(1, 2) + attn_output_30 = None + attn_output_31 = transpose_14.contiguous() + transpose_14 = None + reshape_6 = attn_output_31.reshape(1, 2, -1) + attn_output_31 = None + attn_output_32 = reshape_6.contiguous() + reshape_6 = None + attn_output_33 = torch._C._nn.linear( + attn_output_32, + l_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_32 = l_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_weight_ = 
l_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_34 = torch.nn.functional.dropout(attn_output_33, 0.0, False, False) + attn_output_33 = None + layer_norm_13 = torch.nn.functional.layer_norm( + hidden_states_24, + (2048,), + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_25 = torch._C._nn.linear( + layer_norm_13, + l_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_13 = l_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_26 = torch._C._nn.gelu(hidden_states_25) + hidden_states_25 = None + hidden_states_27 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_26 = l_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_6 = torch.nn.functional.dropout(hidden_states_27, 0.0, False, False) + hidden_states_27 = None + add_26 = mlp_output_6 + attn_output_34 + mlp_output_6 = attn_output_34 = None + hidden_states_28 = add_26 + hidden_states_24 + add_26 = hidden_states_24 = None + layer_norm_14 = torch.nn.functional.layer_norm( + hidden_states_28, + (2048,), + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_ + ) = None + linear_28 = torch._C._nn.linear( + layer_norm_14, + l_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_14 = l_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_8 = linear_28.view((1, 2, -1, 768)) + linear_28 = None + qkv_7 = view_8.transpose(1, 2) + view_8 = None + chunk_7 = qkv_7.chunk(3, dim=-1) + qkv_7 = None + query_states_7 = chunk_7[0] + key_states_7 = chunk_7[1] + value_states_7 = chunk_7[2] + chunk_7 = None + cos_10 = cos_2.unsqueeze(1) + sin_10 = sin_2.unsqueeze(1) + q_rot_7 = query_states_7[(Ellipsis, slice(None, 64, None))] + q_pass_7 = query_states_7[(Ellipsis, slice(64, None, None))] + query_states_7 = None + k_rot_7 = key_states_7[(Ellipsis, slice(None, 64, None))] + k_pass_7 = key_states_7[(Ellipsis, slice(64, None, None))] + key_states_7 = None + mul_31 = q_rot_7 * cos_10 + x1_14 = q_rot_7[(Ellipsis, slice(None, 32, None))] + x2_14 = q_rot_7[(Ellipsis, slice(32, None, None))] + q_rot_7 = None + neg_14 = -x2_14 + x2_14 = None + cat_29 = 
torch.cat((neg_14, x1_14), dim=-1) + neg_14 = x1_14 = None + mul_32 = cat_29 * sin_10 + cat_29 = None + q_embed_14 = mul_31 + mul_32 + mul_31 = mul_32 = None + mul_33 = k_rot_7 * cos_10 + cos_10 = None + x1_15 = k_rot_7[(Ellipsis, slice(None, 32, None))] + x2_15 = k_rot_7[(Ellipsis, slice(32, None, None))] + k_rot_7 = None + neg_15 = -x2_15 + x2_15 = None + cat_30 = torch.cat((neg_15, x1_15), dim=-1) + neg_15 = x1_15 = None + mul_34 = cat_30 * sin_10 + cat_30 = sin_10 = None + k_embed_14 = mul_33 + mul_34 + mul_33 = mul_34 = None + q_embed_15 = torch.cat([q_embed_14, q_pass_7], dim=-1) + q_embed_14 = q_pass_7 = None + k_embed_15 = torch.cat([k_embed_14, k_pass_7], dim=-1) + k_embed_14 = k_pass_7 = None + attention_mask_8 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_7 = q_embed_15.contiguous() + q_embed_15 = None + key_7 = k_embed_15.contiguous() + value_7 = value_states_7.contiguous() + attn_output_35 = torch._C._nn.scaled_dot_product_attention( + query_7, + key_7, + value_7, + attn_mask=attention_mask_8, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_7 = key_7 = value_7 = attention_mask_8 = None + transpose_16 = attn_output_35.transpose(1, 2) + attn_output_35 = None + attn_output_36 = transpose_16.contiguous() + transpose_16 = None + reshape_7 = attn_output_36.reshape(1, 2, -1) + attn_output_36 = None + attn_output_37 = reshape_7.contiguous() + reshape_7 = None + attn_output_38 = torch._C._nn.linear( + attn_output_37, + l_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_37 = l_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_39 = torch.nn.functional.dropout(attn_output_38, 0.0, False, False) + attn_output_38 = None + layer_norm_15 = torch.nn.functional.layer_norm( + hidden_states_28, + (2048,), + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_29 = torch._C._nn.linear( + layer_norm_15, + l_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_15 = l_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_30 = torch._C._nn.gelu(hidden_states_29) + hidden_states_29 = None + hidden_states_31 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_30 = l_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_7 = torch.nn.functional.dropout(hidden_states_31, 0.0, False, False) + hidden_states_31 = None + add_30 = 
mlp_output_7 + attn_output_39 + mlp_output_7 = attn_output_39 = None + hidden_states_32 = add_30 + hidden_states_28 + add_30 = hidden_states_28 = None + layer_norm_16 = torch.nn.functional.layer_norm( + hidden_states_32, + (2048,), + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_ + ) = None + linear_32 = torch._C._nn.linear( + layer_norm_16, + l_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_16 = l_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_9 = linear_32.view((1, 2, -1, 768)) + linear_32 = None + qkv_8 = view_9.transpose(1, 2) + view_9 = None + chunk_8 = qkv_8.chunk(3, dim=-1) + qkv_8 = None + query_states_8 = chunk_8[0] + key_states_8 = chunk_8[1] + value_states_8 = chunk_8[2] + chunk_8 = None + cos_11 = cos_2.unsqueeze(1) + sin_11 = sin_2.unsqueeze(1) + q_rot_8 = query_states_8[(Ellipsis, slice(None, 64, None))] + q_pass_8 = query_states_8[(Ellipsis, slice(64, None, None))] + query_states_8 = None + k_rot_8 = key_states_8[(Ellipsis, slice(None, 64, None))] + k_pass_8 = key_states_8[(Ellipsis, slice(64, None, None))] + key_states_8 = None + mul_35 = q_rot_8 * cos_11 + x1_16 = q_rot_8[(Ellipsis, slice(None, 32, None))] + x2_16 = q_rot_8[(Ellipsis, slice(32, None, None))] + q_rot_8 = None + neg_16 = -x2_16 + x2_16 = None + cat_33 = torch.cat((neg_16, x1_16), dim=-1) + neg_16 = x1_16 = None + mul_36 = cat_33 * sin_11 + cat_33 = None + q_embed_16 = mul_35 + mul_36 + mul_35 = mul_36 = None + mul_37 = k_rot_8 * cos_11 + cos_11 = None + x1_17 = k_rot_8[(Ellipsis, slice(None, 32, None))] + x2_17 = k_rot_8[(Ellipsis, slice(32, None, None))] + k_rot_8 = None + neg_17 = -x2_17 + x2_17 = None + cat_34 = torch.cat((neg_17, x1_17), dim=-1) + neg_17 = x1_17 = None + mul_38 = cat_34 * sin_11 + cat_34 = sin_11 = None + k_embed_16 = mul_37 + mul_38 + mul_37 = mul_38 = None + q_embed_17 = torch.cat([q_embed_16, q_pass_8], dim=-1) + q_embed_16 = q_pass_8 = None + k_embed_17 = torch.cat([k_embed_16, k_pass_8], dim=-1) + k_embed_16 = k_pass_8 = None + attention_mask_9 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_8 = q_embed_17.contiguous() + q_embed_17 = None + key_8 = k_embed_17.contiguous() + value_8 = value_states_8.contiguous() + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + query_8, + key_8, + value_8, + attn_mask=attention_mask_9, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_8 = key_8 = value_8 = attention_mask_9 = None + transpose_18 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_18.contiguous() + transpose_18 = None + reshape_8 = attn_output_41.reshape(1, 2, -1) + attn_output_41 = None + attn_output_42 = reshape_8.contiguous() + reshape_8 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_weight_, + 
l_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_42 = l_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_44 = torch.nn.functional.dropout(attn_output_43, 0.0, False, False) + attn_output_43 = None + layer_norm_17 = torch.nn.functional.layer_norm( + hidden_states_32, + (2048,), + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_33 = torch._C._nn.linear( + layer_norm_17, + l_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_17 = l_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_34 = torch._C._nn.gelu(hidden_states_33) + hidden_states_33 = None + hidden_states_35 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_34 = l_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_8 = torch.nn.functional.dropout(hidden_states_35, 0.0, False, False) + hidden_states_35 = None + add_34 = mlp_output_8 + attn_output_44 + mlp_output_8 = attn_output_44 = None + hidden_states_36 = add_34 + hidden_states_32 + add_34 = hidden_states_32 = None + layer_norm_18 = torch.nn.functional.layer_norm( + hidden_states_36, + (2048,), + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_ + ) = None + linear_36 = torch._C._nn.linear( + layer_norm_18, + l_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_18 = l_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_10 = linear_36.view((1, 2, -1, 768)) + linear_36 = None + qkv_9 = view_10.transpose(1, 2) + view_10 = None + chunk_9 = qkv_9.chunk(3, dim=-1) + qkv_9 = None + query_states_9 = chunk_9[0] + key_states_9 = chunk_9[1] + value_states_9 = chunk_9[2] + chunk_9 = None + cos_12 = cos_2.unsqueeze(1) + sin_12 = sin_2.unsqueeze(1) + q_rot_9 = query_states_9[(Ellipsis, slice(None, 64, None))] + q_pass_9 = query_states_9[(Ellipsis, slice(64, None, None))] + query_states_9 = None + k_rot_9 = key_states_9[(Ellipsis, slice(None, 64, None))] + k_pass_9 = key_states_9[(Ellipsis, slice(64, None, None))] + key_states_9 = None + 
mul_39 = q_rot_9 * cos_12 + x1_18 = q_rot_9[(Ellipsis, slice(None, 32, None))] + x2_18 = q_rot_9[(Ellipsis, slice(32, None, None))] + q_rot_9 = None + neg_18 = -x2_18 + x2_18 = None + cat_37 = torch.cat((neg_18, x1_18), dim=-1) + neg_18 = x1_18 = None + mul_40 = cat_37 * sin_12 + cat_37 = None + q_embed_18 = mul_39 + mul_40 + mul_39 = mul_40 = None + mul_41 = k_rot_9 * cos_12 + cos_12 = None + x1_19 = k_rot_9[(Ellipsis, slice(None, 32, None))] + x2_19 = k_rot_9[(Ellipsis, slice(32, None, None))] + k_rot_9 = None + neg_19 = -x2_19 + x2_19 = None + cat_38 = torch.cat((neg_19, x1_19), dim=-1) + neg_19 = x1_19 = None + mul_42 = cat_38 * sin_12 + cat_38 = sin_12 = None + k_embed_18 = mul_41 + mul_42 + mul_41 = mul_42 = None + q_embed_19 = torch.cat([q_embed_18, q_pass_9], dim=-1) + q_embed_18 = q_pass_9 = None + k_embed_19 = torch.cat([k_embed_18, k_pass_9], dim=-1) + k_embed_18 = k_pass_9 = None + attention_mask_10 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_9 = q_embed_19.contiguous() + q_embed_19 = None + key_9 = k_embed_19.contiguous() + value_9 = value_states_9.contiguous() + attn_output_45 = torch._C._nn.scaled_dot_product_attention( + query_9, + key_9, + value_9, + attn_mask=attention_mask_10, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_9 = key_9 = value_9 = attention_mask_10 = None + transpose_20 = attn_output_45.transpose(1, 2) + attn_output_45 = None + attn_output_46 = transpose_20.contiguous() + transpose_20 = None + reshape_9 = attn_output_46.reshape(1, 2, -1) + attn_output_46 = None + attn_output_47 = reshape_9.contiguous() + reshape_9 = None + attn_output_48 = torch._C._nn.linear( + attn_output_47, + l_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_47 = l_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_49 = torch.nn.functional.dropout(attn_output_48, 0.0, False, False) + attn_output_48 = None + layer_norm_19 = torch.nn.functional.layer_norm( + hidden_states_36, + (2048,), + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_37 = torch._C._nn.linear( + layer_norm_19, + l_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_19 = l_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_38 = torch._C._nn.gelu(hidden_states_37) + hidden_states_37 = None + hidden_states_39 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_38 = l_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = 
l_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_9 = torch.nn.functional.dropout(hidden_states_39, 0.0, False, False) + hidden_states_39 = None + add_38 = mlp_output_9 + attn_output_49 + mlp_output_9 = attn_output_49 = None + hidden_states_40 = add_38 + hidden_states_36 + add_38 = hidden_states_36 = None + layer_norm_20 = torch.nn.functional.layer_norm( + hidden_states_40, + (2048,), + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_ + ) = None + linear_40 = torch._C._nn.linear( + layer_norm_20, + l_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_20 = l_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_11 = linear_40.view((1, 2, -1, 768)) + linear_40 = None + qkv_10 = view_11.transpose(1, 2) + view_11 = None + chunk_10 = qkv_10.chunk(3, dim=-1) + qkv_10 = None + query_states_10 = chunk_10[0] + key_states_10 = chunk_10[1] + value_states_10 = chunk_10[2] + chunk_10 = None + cos_13 = cos_2.unsqueeze(1) + sin_13 = sin_2.unsqueeze(1) + q_rot_10 = query_states_10[(Ellipsis, slice(None, 64, None))] + q_pass_10 = query_states_10[(Ellipsis, slice(64, None, None))] + query_states_10 = None + k_rot_10 = key_states_10[(Ellipsis, slice(None, 64, None))] + k_pass_10 = key_states_10[(Ellipsis, slice(64, None, None))] + key_states_10 = None + mul_43 = q_rot_10 * cos_13 + x1_20 = q_rot_10[(Ellipsis, slice(None, 32, None))] + x2_20 = q_rot_10[(Ellipsis, slice(32, None, None))] + q_rot_10 = None + neg_20 = -x2_20 + x2_20 = None + cat_41 = torch.cat((neg_20, x1_20), dim=-1) + neg_20 = x1_20 = None + mul_44 = cat_41 * sin_13 + cat_41 = None + q_embed_20 = mul_43 + mul_44 + mul_43 = mul_44 = None + mul_45 = k_rot_10 * cos_13 + cos_13 = None + x1_21 = k_rot_10[(Ellipsis, slice(None, 32, None))] + x2_21 = k_rot_10[(Ellipsis, slice(32, None, None))] + k_rot_10 = None + neg_21 = -x2_21 + x2_21 = None + cat_42 = torch.cat((neg_21, x1_21), dim=-1) + neg_21 = x1_21 = None + mul_46 = cat_42 * sin_13 + cat_42 = sin_13 = None + k_embed_20 = mul_45 + mul_46 + mul_45 = mul_46 = None + q_embed_21 = torch.cat([q_embed_20, q_pass_10], dim=-1) + q_embed_20 = q_pass_10 = None + k_embed_21 = torch.cat([k_embed_20, k_pass_10], dim=-1) + k_embed_20 = k_pass_10 = None + attention_mask_11 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_10 = q_embed_21.contiguous() + q_embed_21 = None + key_10 = k_embed_21.contiguous() + value_10 = value_states_10.contiguous() + attn_output_50 = torch._C._nn.scaled_dot_product_attention( + query_10, + key_10, + value_10, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_10 = key_10 = value_10 = attention_mask_11 = None + transpose_22 = attn_output_50.transpose(1, 2) + attn_output_50 = None + attn_output_51 = transpose_22.contiguous() + transpose_22 = None + reshape_10 = attn_output_51.reshape(1, 2, -1) + attn_output_51 = None + 
attn_output_52 = reshape_10.contiguous() + reshape_10 = None + attn_output_53 = torch._C._nn.linear( + attn_output_52, + l_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_52 = l_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_54 = torch.nn.functional.dropout(attn_output_53, 0.0, False, False) + attn_output_53 = None + layer_norm_21 = torch.nn.functional.layer_norm( + hidden_states_40, + (2048,), + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_41 = torch._C._nn.linear( + layer_norm_21, + l_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_21 = l_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_42 = torch._C._nn.gelu(hidden_states_41) + hidden_states_41 = None + hidden_states_43 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_42 = l_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_10 = torch.nn.functional.dropout(hidden_states_43, 0.0, False, False) + hidden_states_43 = None + add_42 = mlp_output_10 + attn_output_54 + mlp_output_10 = attn_output_54 = None + hidden_states_44 = add_42 + hidden_states_40 + add_42 = hidden_states_40 = None + layer_norm_22 = torch.nn.functional.layer_norm( + hidden_states_44, + (2048,), + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_ + ) = None + linear_44 = torch._C._nn.linear( + layer_norm_22, + l_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_22 = l_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_12 = linear_44.view((1, 2, -1, 768)) + linear_44 = None + qkv_11 = view_12.transpose(1, 2) + view_12 = None + chunk_11 = qkv_11.chunk(3, dim=-1) + qkv_11 = None + query_states_11 = chunk_11[0] + key_states_11 = chunk_11[1] + value_states_11 = chunk_11[2] + chunk_11 = None + cos_14 = cos_2.unsqueeze(1) + sin_14 = sin_2.unsqueeze(1) + q_rot_11 = query_states_11[(Ellipsis, 
slice(None, 64, None))] + q_pass_11 = query_states_11[(Ellipsis, slice(64, None, None))] + query_states_11 = None + k_rot_11 = key_states_11[(Ellipsis, slice(None, 64, None))] + k_pass_11 = key_states_11[(Ellipsis, slice(64, None, None))] + key_states_11 = None + mul_47 = q_rot_11 * cos_14 + x1_22 = q_rot_11[(Ellipsis, slice(None, 32, None))] + x2_22 = q_rot_11[(Ellipsis, slice(32, None, None))] + q_rot_11 = None + neg_22 = -x2_22 + x2_22 = None + cat_45 = torch.cat((neg_22, x1_22), dim=-1) + neg_22 = x1_22 = None + mul_48 = cat_45 * sin_14 + cat_45 = None + q_embed_22 = mul_47 + mul_48 + mul_47 = mul_48 = None + mul_49 = k_rot_11 * cos_14 + cos_14 = None + x1_23 = k_rot_11[(Ellipsis, slice(None, 32, None))] + x2_23 = k_rot_11[(Ellipsis, slice(32, None, None))] + k_rot_11 = None + neg_23 = -x2_23 + x2_23 = None + cat_46 = torch.cat((neg_23, x1_23), dim=-1) + neg_23 = x1_23 = None + mul_50 = cat_46 * sin_14 + cat_46 = sin_14 = None + k_embed_22 = mul_49 + mul_50 + mul_49 = mul_50 = None + q_embed_23 = torch.cat([q_embed_22, q_pass_11], dim=-1) + q_embed_22 = q_pass_11 = None + k_embed_23 = torch.cat([k_embed_22, k_pass_11], dim=-1) + k_embed_22 = k_pass_11 = None + attention_mask_12 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_11 = q_embed_23.contiguous() + q_embed_23 = None + key_11 = k_embed_23.contiguous() + value_11 = value_states_11.contiguous() + attn_output_55 = torch._C._nn.scaled_dot_product_attention( + query_11, + key_11, + value_11, + attn_mask=attention_mask_12, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_11 = key_11 = value_11 = attention_mask_12 = None + transpose_24 = attn_output_55.transpose(1, 2) + attn_output_55 = None + attn_output_56 = transpose_24.contiguous() + transpose_24 = None + reshape_11 = attn_output_56.reshape(1, 2, -1) + attn_output_56 = None + attn_output_57 = reshape_11.contiguous() + reshape_11 = None + attn_output_58 = torch._C._nn.linear( + attn_output_57, + l_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_57 = l_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_59 = torch.nn.functional.dropout(attn_output_58, 0.0, False, False) + attn_output_58 = None + layer_norm_23 = torch.nn.functional.layer_norm( + hidden_states_44, + (2048,), + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_45 = torch._C._nn.linear( + layer_norm_23, + l_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_23 = l_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_46 = torch._C._nn.gelu(hidden_states_45) + hidden_states_45 = None + hidden_states_47 = torch._C._nn.linear( + hidden_states_46, + 
l_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_46 = l_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_11 = torch.nn.functional.dropout(hidden_states_47, 0.0, False, False) + hidden_states_47 = None + add_46 = mlp_output_11 + attn_output_59 + mlp_output_11 = attn_output_59 = None + hidden_states_48 = add_46 + hidden_states_44 + add_46 = hidden_states_44 = None + layer_norm_24 = torch.nn.functional.layer_norm( + hidden_states_48, + (2048,), + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_ + ) = None + linear_48 = torch._C._nn.linear( + layer_norm_24, + l_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_24 = l_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_13 = linear_48.view((1, 2, -1, 768)) + linear_48 = None + qkv_12 = view_13.transpose(1, 2) + view_13 = None + chunk_12 = qkv_12.chunk(3, dim=-1) + qkv_12 = None + query_states_12 = chunk_12[0] + key_states_12 = chunk_12[1] + value_states_12 = chunk_12[2] + chunk_12 = None + cos_15 = cos_2.unsqueeze(1) + sin_15 = sin_2.unsqueeze(1) + q_rot_12 = query_states_12[(Ellipsis, slice(None, 64, None))] + q_pass_12 = query_states_12[(Ellipsis, slice(64, None, None))] + query_states_12 = None + k_rot_12 = key_states_12[(Ellipsis, slice(None, 64, None))] + k_pass_12 = key_states_12[(Ellipsis, slice(64, None, None))] + key_states_12 = None + mul_51 = q_rot_12 * cos_15 + x1_24 = q_rot_12[(Ellipsis, slice(None, 32, None))] + x2_24 = q_rot_12[(Ellipsis, slice(32, None, None))] + q_rot_12 = None + neg_24 = -x2_24 + x2_24 = None + cat_49 = torch.cat((neg_24, x1_24), dim=-1) + neg_24 = x1_24 = None + mul_52 = cat_49 * sin_15 + cat_49 = None + q_embed_24 = mul_51 + mul_52 + mul_51 = mul_52 = None + mul_53 = k_rot_12 * cos_15 + cos_15 = None + x1_25 = k_rot_12[(Ellipsis, slice(None, 32, None))] + x2_25 = k_rot_12[(Ellipsis, slice(32, None, None))] + k_rot_12 = None + neg_25 = -x2_25 + x2_25 = None + cat_50 = torch.cat((neg_25, x1_25), dim=-1) + neg_25 = x1_25 = None + mul_54 = cat_50 * sin_15 + cat_50 = sin_15 = None + k_embed_24 = mul_53 + mul_54 + mul_53 = mul_54 = None + q_embed_25 = torch.cat([q_embed_24, q_pass_12], dim=-1) + q_embed_24 = q_pass_12 = None + k_embed_25 = torch.cat([k_embed_24, k_pass_12], dim=-1) + k_embed_24 = k_pass_12 = None + attention_mask_13 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_12 = q_embed_25.contiguous() + q_embed_25 = None + key_12 = k_embed_25.contiguous() + value_12 = value_states_12.contiguous() + attn_output_60 = torch._C._nn.scaled_dot_product_attention( + query_12, + key_12, + value_12, + attn_mask=attention_mask_13, + dropout_p=0.0, + scale=0.0625, + 
is_causal=False, + ) + query_12 = key_12 = value_12 = attention_mask_13 = None + transpose_26 = attn_output_60.transpose(1, 2) + attn_output_60 = None + attn_output_61 = transpose_26.contiguous() + transpose_26 = None + reshape_12 = attn_output_61.reshape(1, 2, -1) + attn_output_61 = None + attn_output_62 = reshape_12.contiguous() + reshape_12 = None + attn_output_63 = torch._C._nn.linear( + attn_output_62, + l_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_62 = l_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_64 = torch.nn.functional.dropout(attn_output_63, 0.0, False, False) + attn_output_63 = None + layer_norm_25 = torch.nn.functional.layer_norm( + hidden_states_48, + (2048,), + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_49 = torch._C._nn.linear( + layer_norm_25, + l_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_25 = l_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_50 = torch._C._nn.gelu(hidden_states_49) + hidden_states_49 = None + hidden_states_51 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_50 = l_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_12 = torch.nn.functional.dropout(hidden_states_51, 0.0, False, False) + hidden_states_51 = None + add_50 = mlp_output_12 + attn_output_64 + mlp_output_12 = attn_output_64 = None + hidden_states_52 = add_50 + hidden_states_48 + add_50 = hidden_states_48 = None + layer_norm_26 = torch.nn.functional.layer_norm( + hidden_states_52, + (2048,), + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_ + ) = None + linear_52 = torch._C._nn.linear( + layer_norm_26, + l_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_26 = l_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_14 = linear_52.view((1, 2, -1, 768)) + linear_52 = None + qkv_13 = view_14.transpose(1, 
2) + view_14 = None + chunk_13 = qkv_13.chunk(3, dim=-1) + qkv_13 = None + query_states_13 = chunk_13[0] + key_states_13 = chunk_13[1] + value_states_13 = chunk_13[2] + chunk_13 = None + cos_16 = cos_2.unsqueeze(1) + sin_16 = sin_2.unsqueeze(1) + q_rot_13 = query_states_13[(Ellipsis, slice(None, 64, None))] + q_pass_13 = query_states_13[(Ellipsis, slice(64, None, None))] + query_states_13 = None + k_rot_13 = key_states_13[(Ellipsis, slice(None, 64, None))] + k_pass_13 = key_states_13[(Ellipsis, slice(64, None, None))] + key_states_13 = None + mul_55 = q_rot_13 * cos_16 + x1_26 = q_rot_13[(Ellipsis, slice(None, 32, None))] + x2_26 = q_rot_13[(Ellipsis, slice(32, None, None))] + q_rot_13 = None + neg_26 = -x2_26 + x2_26 = None + cat_53 = torch.cat((neg_26, x1_26), dim=-1) + neg_26 = x1_26 = None + mul_56 = cat_53 * sin_16 + cat_53 = None + q_embed_26 = mul_55 + mul_56 + mul_55 = mul_56 = None + mul_57 = k_rot_13 * cos_16 + cos_16 = None + x1_27 = k_rot_13[(Ellipsis, slice(None, 32, None))] + x2_27 = k_rot_13[(Ellipsis, slice(32, None, None))] + k_rot_13 = None + neg_27 = -x2_27 + x2_27 = None + cat_54 = torch.cat((neg_27, x1_27), dim=-1) + neg_27 = x1_27 = None + mul_58 = cat_54 * sin_16 + cat_54 = sin_16 = None + k_embed_26 = mul_57 + mul_58 + mul_57 = mul_58 = None + q_embed_27 = torch.cat([q_embed_26, q_pass_13], dim=-1) + q_embed_26 = q_pass_13 = None + k_embed_27 = torch.cat([k_embed_26, k_pass_13], dim=-1) + k_embed_26 = k_pass_13 = None + attention_mask_14 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_13 = q_embed_27.contiguous() + q_embed_27 = None + key_13 = k_embed_27.contiguous() + value_13 = value_states_13.contiguous() + attn_output_65 = torch._C._nn.scaled_dot_product_attention( + query_13, + key_13, + value_13, + attn_mask=attention_mask_14, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_13 = key_13 = value_13 = attention_mask_14 = None + transpose_28 = attn_output_65.transpose(1, 2) + attn_output_65 = None + attn_output_66 = transpose_28.contiguous() + transpose_28 = None + reshape_13 = attn_output_66.reshape(1, 2, -1) + attn_output_66 = None + attn_output_67 = reshape_13.contiguous() + reshape_13 = None + attn_output_68 = torch._C._nn.linear( + attn_output_67, + l_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_67 = l_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_69 = torch.nn.functional.dropout(attn_output_68, 0.0, False, False) + attn_output_68 = None + layer_norm_27 = torch.nn.functional.layer_norm( + hidden_states_52, + (2048,), + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_53 = torch._C._nn.linear( + layer_norm_27, + l_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_27 = 
l_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_54 = torch._C._nn.gelu(hidden_states_53) + hidden_states_53 = None + hidden_states_55 = torch._C._nn.linear( + hidden_states_54, + l_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_54 = l_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_13 = torch.nn.functional.dropout(hidden_states_55, 0.0, False, False) + hidden_states_55 = None + add_54 = mlp_output_13 + attn_output_69 + mlp_output_13 = attn_output_69 = None + hidden_states_56 = add_54 + hidden_states_52 + add_54 = hidden_states_52 = None + layer_norm_28 = torch.nn.functional.layer_norm( + hidden_states_56, + (2048,), + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_ + ) = None + linear_56 = torch._C._nn.linear( + layer_norm_28, + l_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_weight_, + l_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_28 = l_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_15 = linear_56.view((1, 2, -1, 768)) + linear_56 = None + qkv_14 = view_15.transpose(1, 2) + view_15 = None + chunk_14 = qkv_14.chunk(3, dim=-1) + qkv_14 = None + query_states_14 = chunk_14[0] + key_states_14 = chunk_14[1] + value_states_14 = chunk_14[2] + chunk_14 = None + cos_17 = cos_2.unsqueeze(1) + sin_17 = sin_2.unsqueeze(1) + q_rot_14 = query_states_14[(Ellipsis, slice(None, 64, None))] + q_pass_14 = query_states_14[(Ellipsis, slice(64, None, None))] + query_states_14 = None + k_rot_14 = key_states_14[(Ellipsis, slice(None, 64, None))] + k_pass_14 = key_states_14[(Ellipsis, slice(64, None, None))] + key_states_14 = None + mul_59 = q_rot_14 * cos_17 + x1_28 = q_rot_14[(Ellipsis, slice(None, 32, None))] + x2_28 = q_rot_14[(Ellipsis, slice(32, None, None))] + q_rot_14 = None + neg_28 = -x2_28 + x2_28 = None + cat_57 = torch.cat((neg_28, x1_28), dim=-1) + neg_28 = x1_28 = None + mul_60 = cat_57 * sin_17 + cat_57 = None + q_embed_28 = mul_59 + mul_60 + mul_59 = mul_60 = None + mul_61 = k_rot_14 * cos_17 + cos_17 = None + x1_29 = k_rot_14[(Ellipsis, slice(None, 32, None))] + x2_29 = k_rot_14[(Ellipsis, slice(32, None, None))] + k_rot_14 = None + neg_29 = -x2_29 + x2_29 = None + cat_58 = torch.cat((neg_29, x1_29), dim=-1) + neg_29 = x1_29 = None + mul_62 = cat_58 * sin_17 + cat_58 = sin_17 = None + k_embed_28 = mul_61 + mul_62 + mul_61 = mul_62 = None + q_embed_29 = torch.cat([q_embed_28, q_pass_14], dim=-1) + q_embed_28 = q_pass_14 = None + k_embed_29 = torch.cat([k_embed_28, k_pass_14], dim=-1) + k_embed_28 = k_pass_14 = None + attention_mask_15 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + 
slice(None, 2, None), + ) + ] + query_14 = q_embed_29.contiguous() + q_embed_29 = None + key_14 = k_embed_29.contiguous() + value_14 = value_states_14.contiguous() + attn_output_70 = torch._C._nn.scaled_dot_product_attention( + query_14, + key_14, + value_14, + attn_mask=attention_mask_15, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_14 = key_14 = value_14 = attention_mask_15 = None + transpose_30 = attn_output_70.transpose(1, 2) + attn_output_70 = None + attn_output_71 = transpose_30.contiguous() + transpose_30 = None + reshape_14 = attn_output_71.reshape(1, 2, -1) + attn_output_71 = None + attn_output_72 = reshape_14.contiguous() + reshape_14 = None + attn_output_73 = torch._C._nn.linear( + attn_output_72, + l_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_72 = l_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_74 = torch.nn.functional.dropout(attn_output_73, 0.0, False, False) + attn_output_73 = None + layer_norm_29 = torch.nn.functional.layer_norm( + hidden_states_56, + (2048,), + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_57 = torch._C._nn.linear( + layer_norm_29, + l_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_29 = l_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_58 = torch._C._nn.gelu(hidden_states_57) + hidden_states_57 = None + hidden_states_59 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_58 = l_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_14 = torch.nn.functional.dropout(hidden_states_59, 0.0, False, False) + hidden_states_59 = None + add_58 = mlp_output_14 + attn_output_74 + mlp_output_14 = attn_output_74 = None + hidden_states_60 = add_58 + hidden_states_56 + add_58 = hidden_states_56 = None + layer_norm_30 = torch.nn.functional.layer_norm( + hidden_states_60, + (2048,), + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_, + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_ + ) = None + linear_60 = torch._C._nn.linear( + layer_norm_30, + l_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_weight_, + 
l_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_bias_, + ) + layer_norm_30 = l_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_weight_ = l_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_bias_ = (None) + view_16 = linear_60.view((1, 2, -1, 768)) + linear_60 = None + qkv_15 = view_16.transpose(1, 2) + view_16 = None + chunk_15 = qkv_15.chunk(3, dim=-1) + qkv_15 = None + query_states_15 = chunk_15[0] + key_states_15 = chunk_15[1] + value_states_15 = chunk_15[2] + chunk_15 = None + cos_18 = cos_2.unsqueeze(1) + cos_2 = None + sin_18 = sin_2.unsqueeze(1) + sin_2 = None + q_rot_15 = query_states_15[(Ellipsis, slice(None, 64, None))] + q_pass_15 = query_states_15[(Ellipsis, slice(64, None, None))] + query_states_15 = None + k_rot_15 = key_states_15[(Ellipsis, slice(None, 64, None))] + k_pass_15 = key_states_15[(Ellipsis, slice(64, None, None))] + key_states_15 = None + mul_63 = q_rot_15 * cos_18 + x1_30 = q_rot_15[(Ellipsis, slice(None, 32, None))] + x2_30 = q_rot_15[(Ellipsis, slice(32, None, None))] + q_rot_15 = None + neg_30 = -x2_30 + x2_30 = None + cat_61 = torch.cat((neg_30, x1_30), dim=-1) + neg_30 = x1_30 = None + mul_64 = cat_61 * sin_18 + cat_61 = None + q_embed_30 = mul_63 + mul_64 + mul_63 = mul_64 = None + mul_65 = k_rot_15 * cos_18 + cos_18 = None + x1_31 = k_rot_15[(Ellipsis, slice(None, 32, None))] + x2_31 = k_rot_15[(Ellipsis, slice(32, None, None))] + k_rot_15 = None + neg_31 = -x2_31 + x2_31 = None + cat_62 = torch.cat((neg_31, x1_31), dim=-1) + neg_31 = x1_31 = None + mul_66 = cat_62 * sin_18 + cat_62 = sin_18 = None + k_embed_30 = mul_65 + mul_66 + mul_65 = mul_66 = None + q_embed_31 = torch.cat([q_embed_30, q_pass_15], dim=-1) + q_embed_30 = q_pass_15 = None + k_embed_31 = torch.cat([k_embed_30, k_pass_15], dim=-1) + k_embed_30 = k_pass_15 = None + attention_mask_16 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + causal_mask_2 = None + query_15 = q_embed_31.contiguous() + q_embed_31 = None + key_15 = k_embed_31.contiguous() + value_15 = value_states_15.contiguous() + attn_output_75 = torch._C._nn.scaled_dot_product_attention( + query_15, + key_15, + value_15, + attn_mask=attention_mask_16, + dropout_p=0.0, + scale=0.0625, + is_causal=False, + ) + query_15 = key_15 = value_15 = attention_mask_16 = None + transpose_32 = attn_output_75.transpose(1, 2) + attn_output_75 = None + attn_output_76 = transpose_32.contiguous() + transpose_32 = None + reshape_15 = attn_output_76.reshape(1, 2, -1) + attn_output_76 = None + attn_output_77 = reshape_15.contiguous() + reshape_15 = None + attn_output_78 = torch._C._nn.linear( + attn_output_77, + l_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_weight_, + l_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_bias_, + ) + attn_output_77 = l_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_weight_ = l_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_bias_ = (None) + attn_output_79 = torch.nn.functional.dropout(attn_output_78, 0.0, False, False) + attn_output_78 = None + layer_norm_31 = torch.nn.functional.layer_norm( + hidden_states_60, + (2048,), + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + 
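+ # --- Reading aid (not part of the captured graph): the remaining statements
+ # below finish layer 15's MLP, apply the final layer norm, and return a flat
+ # 33-element tuple: for each of the 16 layers, the layer's value tensor and
+ # rotated key (its KV-cache entry), followed by the final normalized hidden
+ # states. A hypothetical helper for repacking that output; the name
+ # `split_graph_outputs` and `num_layers=16` are assumptions read off the
+ # return list.
+ #
+ # def split_graph_outputs(outputs, num_layers=16):
+ #     # outputs = (value_0, key_0, value_1, key_1, ..., hidden_states)
+ #     flat, hidden_states = outputs[:-1], outputs[-1]
+ #     kv_cache = [(flat[2 * i], flat[2 * i + 1]) for i in range(num_layers)]
+ #     return kv_cache, hidden_states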
l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_bias_ = (None) + hidden_states_61 = torch._C._nn.linear( + layer_norm_31, + l_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layer_norm_31 = l_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + hidden_states_62 = torch._C._nn.gelu(hidden_states_61) + hidden_states_61 = None + hidden_states_63 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_62 = l_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + mlp_output_15 = torch.nn.functional.dropout(hidden_states_63, 0.0, False, False) + hidden_states_63 = None + add_62 = mlp_output_15 + attn_output_79 + mlp_output_15 = attn_output_79 = None + hidden_states_64 = add_62 + hidden_states_60 + add_62 = hidden_states_60 = None + hidden_states_65 = torch.nn.functional.layer_norm( + hidden_states_64, + (2048,), + l_self_modules_final_layer_norm_parameters_weight_, + l_self_modules_final_layer_norm_parameters_bias_, + 1e-05, + ) + hidden_states_64 = ( + l_self_modules_final_layer_norm_parameters_weight_ + ) = l_self_modules_final_layer_norm_parameters_bias_ = None + return ( + value_states, + k_embed_1, + value_states_1, + k_embed_3, + value_states_2, + k_embed_5, + value_states_3, + k_embed_7, + value_states_4, + k_embed_9, + value_states_5, + k_embed_11, + value_states_6, + k_embed_13, + value_states_7, + k_embed_15, + value_states_8, + k_embed_17, + value_states_9, + k_embed_19, + value_states_10, + k_embed_21, + value_states_11, + k_embed_23, + value_states_12, + k_embed_25, + value_states_13, + k_embed_27, + value_states_14, + k_embed_29, + value_states_15, + k_embed_31, + hidden_states_65, + ) diff --git a/samples/transformers-auto-model/EleutherAI/pythia-1b/weight_meta.py b/samples/transformers-auto-model/EleutherAI/pythia-1b/weight_meta.py new file mode 100644 index 000000000..ff0d23726 --- /dev/null +++ b/samples/transformers-auto-model/EleutherAI/pythia-1b/weight_meta.py @@ -0,0 +1,2001 @@ +class Program_weight_tensor_meta_L_inputs_embeds_: + name = "L_inputs_embeds_" + shape = [1, 2, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1] + + +class Program_weight_tensor_meta_L_self_modules_rotary_emb_buffers_inv_freq_: + name = "L_self_modules_rotary_emb_buffers_inv_freq_" + shape = [32] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.125 + std = 0.240 + data = [ + 1.000000, + 0.749894, + 0.562341, + 0.421697, + 0.316228, + 0.237137, + 0.177828, + 0.133352, + 0.100000, + 0.074989, + 0.056234, + 0.042170, + 0.031623, + 0.023714, + 0.017783, + 0.013335, + 0.010000, + 0.007499, + 0.005623, + 0.004217, + 0.003162, + 0.002371, + 0.001778, + 0.001334, + 0.001000, + 0.000750, + 
0.000562, + 0.000422, + 0.000316, + 0.000237, + 0.000178, + 0.000133, + ] + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 
8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + 
shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = 
"L_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = 
None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 
+ std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + 
mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + 
mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 
0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = 
"torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = 
"L_self_modules_layers_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 
+ std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + 
device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_" + 
shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_bias_: + name = 
"L_self_modules_layers_modules_14_modules_attention_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [8192, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + shape = [8192] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [2048, 8192] + dtype = "torch.float16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = "L_self_modules_layers_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_layers_modules_15_modules_input_layernorm_parameters_bias_" + shape = [2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_weight_" + shape = [6144, 2048] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_layers_modules_15_modules_attention_modules_query_key_value_parameters_bias_" + shape = [6144] + dtype = "torch.float16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_attention_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_"
+    shape = [8192, 2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_"
+    shape = [8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_"
+    shape = [2048, 8192]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_:
+    name = "L_self_modules_layers_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_final_layer_norm_parameters_weight_:
+    name = "L_self_modules_final_layer_norm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_final_layer_norm_parameters_bias_:
+    name = "L_self_modules_final_layer_norm_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
diff --git a/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/graph_hash.txt b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/graph_hash.txt
new file mode 100644
index 000000000..57d2a164a
--- /dev/null
+++ b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/graph_hash.txt
@@ -0,0 +1 @@
+5bb3cf1c0350744ee87f3932d817e762683f656a3e8e6338aa8592a415356a53
\ No newline at end of file
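Each Program_weight_tensor_meta_* class above records only summary statistics of a parameter (shape, dtype, device, mean, std) rather than the weights themselves, so a sample graph can be replayed without shipping the checkpoint. A minimal sketch of how such an entry could be turned back into a stand-in tensor is below; the materialize helper is an illustrative assumption for this review, not an API defined by this patch:

import torch

def materialize(meta):
    # Illustrative only: rebuild a stand-in tensor from one of the
    # Program_weight_tensor_meta_* classes above. Uses the recorded raw
    # data when present, otherwise samples N(mean, std) in the recorded
    # dtype on the recorded device (assumes that device is available).
    dtype = getattr(torch, meta.dtype.split(".")[-1])  # "torch.float16" -> torch.float16
    if meta.data is not None:
        return torch.tensor(meta.data, dtype=dtype, device=meta.device).reshape(meta.shape)
    t = torch.empty(meta.shape, dtype=dtype, device=meta.device)
    return t.normal_(mean=meta.mean, std=meta.std)

For example, materialize(Program_weight_tensor_meta_L_self_modules_final_layer_norm_parameters_weight_) would yield a [2048] float16 tensor of all ones (mean = 1.000, std = 0.000), matching a freshly initialized LayerNorm weight. The SmolLM3-3B model.py that follows uses the same naming convention at graph level: the captured GraphModule flattens every module parameter and buffer into a single forward signature, each argument name encoding its module path (e.g. L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ is layer 0's q_proj weight).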
diff --git a/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/graph_net.json b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/graph_net.json
new file mode 100644
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "torch",
+    "num_devices_required": 1,
+    "num_nodes_required": 1,
+    "dynamic": false
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/input_meta.py b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/input_tensor_constraints.py b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/model.py b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/model.py
new file mode 100644
index 000000000..a28a29830
--- /dev/null
+++ b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/model.py
@@ -0,0 +1,7500 @@
+import torch
+
+from torch import device
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_kwargs_input_ids_: torch.Tensor,
+        L_self_modules_model_modules_model_modules_embed_tokens_parameters_weight_: torch.nn.parameter.Parameter,
+        L_kwargs_attention_mask_: torch.Tensor,
+        L_self_modules_model_modules_model_modules_rotary_emb_buffers_inv_freq_: torch.Tensor,
+        L_self_modules_model_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_:
torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_32_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_32_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_33_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_33_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_34_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_34_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_35_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_35_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_model_modules_norm_parameters_weight_: torch.nn.parameter.Parameter, + ): + l_kwargs_input_ids_ = L_kwargs_input_ids_ + l_self_modules_model_modules_model_modules_embed_tokens_parameters_weight_ = ( + L_self_modules_model_modules_model_modules_embed_tokens_parameters_weight_ + ) + l_kwargs_attention_mask_ = L_kwargs_attention_mask_ + l_self_modules_model_modules_model_modules_rotary_emb_buffers_inv_freq_ = ( + L_self_modules_model_modules_model_modules_rotary_emb_buffers_inv_freq_ + ) + l_self_modules_model_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = 
L_self_modules_model_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_24_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_25_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_26_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_27_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_28_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_29_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_30_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_31_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_32_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_32_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_32_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_32_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_33_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_33_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_33_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_33_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_34_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_34_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_34_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_34_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_35_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_35_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_35_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_35_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_model_modules_norm_parameters_weight_ = (
+            L_self_modules_model_modules_model_modules_norm_parameters_weight_
+        )
+        inputs_embeds = torch.nn.functional.embedding(
+            l_kwargs_input_ids_,
+            l_self_modules_model_modules_model_modules_embed_tokens_parameters_weight_,
+            128004,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_kwargs_input_ids_ = None
+        cache_position = torch.arange(0, 2, device=device(type="cuda", index=0))
+        position_ids = cache_position.unsqueeze(0)
+        attention_mask = 
l_kwargs_attention_mask_.to( + device=device(type="cuda", index=0), dtype=torch.bool + ) + l_kwargs_attention_mask_ = None + mask_indices = torch.arange(2, device=device(type="cuda", index=0)) + mask_indices += 0 + mask_indices_1 = mask_indices + mask_indices = None + local_padding_mask = attention_mask[(slice(None, None, None), mask_indices_1)] + attention_mask = mask_indices_1 = None + kv_arange = torch.arange(2, device=device(type="cuda", index=0)) + kv_arange += 0 + kv_arange_1 = kv_arange + kv_arange = None + reshaped_cache_position = cache_position.view(-1, 1) + cache_position = None + causal_mask = kv_arange_1 <= reshaped_cache_position + kv_arange_1 = reshaped_cache_position = None + getitem_1 = causal_mask[ + (None, None, slice(None, None, None), slice(None, None, None)) + ] + causal_mask = None + causal_mask_1 = getitem_1.expand(1, -1, -1, -1) + getitem_1 = None + getitem_2 = local_padding_mask[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + local_padding_mask = None + causal_mask_2 = causal_mask_1 * getitem_2 + causal_mask_1 = getitem_2 = None + _set_grad_enabled = torch._C._set_grad_enabled(False) + _set_grad_enabled = None + getitem_3 = ( + l_self_modules_model_modules_model_modules_rotary_emb_buffers_inv_freq_[ + (None, slice(None, None, None), None) + ] + ) + l_self_modules_model_modules_model_modules_rotary_emb_buffers_inv_freq_ = None + float_1 = getitem_3.float() + getitem_3 = None + expand_1 = float_1.expand(1, -1, 1) + float_1 = None + inv_freq_expanded = expand_1.to(device(type="cuda", index=0)) + expand_1 = None + getitem_4 = position_ids[ + (slice(None, None, None), None, slice(None, None, None)) + ] + position_ids = None + position_ids_expanded = getitem_4.float() + getitem_4 = None + float_3 = inv_freq_expanded.float() + inv_freq_expanded = None + float_4 = position_ids_expanded.float() + position_ids_expanded = None + matmul = float_3 @ float_4 + float_3 = float_4 = None + freqs = matmul.transpose(1, 2) + matmul = None + emb = torch.cat((freqs, freqs), dim=-1) + freqs = None + cos = emb.cos() + cos_1 = cos * 1.0 + cos = None + sin = emb.sin() + emb = None + sin_1 = sin * 1.0 + sin = None + cos_2 = cos_1.to(dtype=torch.bfloat16) + cos_1 = None + sin_2 = sin_1.to(dtype=torch.bfloat16) + sin_1 = None + _set_grad_enabled_1 = torch._C._set_grad_enabled(True) + _set_grad_enabled_1 = None + _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module") + _log_api_usage_once = None + hidden_states = inputs_embeds.to(torch.float32) + pow_1 = hidden_states.pow(2) + variance = pow_1.mean(-1, keepdim=True) + pow_1 = None + add = variance + 1e-06 + variance = None + rsqrt = torch.rsqrt(add) + add = None + hidden_states_1 = hidden_states * rsqrt + hidden_states = rsqrt = None + to_5 = hidden_states_1.to(torch.bfloat16) + hidden_states_1 = None + hidden_states_2 = ( + l_self_modules_model_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + * to_5 + ) + l_self_modules_model_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = ( + to_5 + ) = None + linear = torch._C._nn.linear( + hidden_states_2, + l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_1 = linear.view((1, 2, -1, 128)) + linear = None + query_states = view_1.transpose(1, 2) + view_1 = None + linear_1 = 
torch._C._nn.linear( + hidden_states_2, + l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_2 = linear_1.view((1, 2, -1, 128)) + linear_1 = None + key_states = view_2.transpose(1, 2) + view_2 = None + linear_2 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_2 = l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_3 = linear_2.view((1, 2, -1, 128)) + linear_2 = None + value_states = view_3.transpose(1, 2) + view_3 = None + cos_3 = cos_2.unsqueeze(1) + sin_3 = sin_2.unsqueeze(1) + mul_5 = query_states * cos_3 + x1 = query_states[(Ellipsis, slice(None, 64, None))] + x2 = query_states[(Ellipsis, slice(64, None, None))] + query_states = None + neg = -x2 + x2 = None + cat_1 = torch.cat((neg, x1), dim=-1) + neg = x1 = None + mul_6 = cat_1 * sin_3 + cat_1 = None + q_embed = mul_5 + mul_6 + mul_5 = mul_6 = None + mul_7 = key_states * cos_3 + cos_3 = None + x1_1 = key_states[(Ellipsis, slice(None, 64, None))] + x2_1 = key_states[(Ellipsis, slice(64, None, None))] + key_states = None + neg_1 = -x2_1 + x2_1 = None + cat_2 = torch.cat((neg_1, x1_1), dim=-1) + neg_1 = x1_1 = None + mul_8 = cat_2 * sin_3 + cat_2 = sin_3 = None + k_embed = mul_7 + mul_8 + mul_7 = mul_8 = None + getitem_9 = k_embed[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed = None + hidden_states_3 = getitem_9.expand(1, 4, 4, 2, 128) + getitem_9 = None + key = hidden_states_3.reshape(1, 16, 2, 128) + hidden_states_3 = None + getitem_10 = value_states[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states = None + hidden_states_4 = getitem_10.expand(1, 4, 4, 2, 128) + getitem_10 = None + value = hidden_states_4.reshape(1, 16, 2, 128) + hidden_states_4 = None + attention_mask_1 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query = q_embed.contiguous() + q_embed = None + key_1 = key.contiguous() + key = None + value_1 = value.contiguous() + value = None + attn_output = torch._C._nn.scaled_dot_product_attention( + query, + key_1, + value_1, + attn_mask=attention_mask_1, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query = key_1 = value_1 = attention_mask_1 = None + transpose_4 = attn_output.transpose(1, 2) + attn_output = None + attn_output_1 = transpose_4.contiguous() + transpose_4 = None + reshape_2 = attn_output_1.reshape(1, 2, -1) + attn_output_1 = None + attn_output_2 = reshape_2.contiguous() + reshape_2 = None + attn_output_3 = torch._C._nn.linear( + attn_output_2, + l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_2 = l_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_5 = inputs_embeds + attn_output_3 + inputs_embeds = attn_output_3 = None + hidden_states_6 = hidden_states_5.to(torch.float32) + pow_2 = hidden_states_6.pow(2) + variance_1 = pow_2.mean(-1, 
keepdim=True) + pow_2 = None + add_4 = variance_1 + 1e-06 + variance_1 = None + rsqrt_1 = torch.rsqrt(add_4) + add_4 = None + hidden_states_7 = hidden_states_6 * rsqrt_1 + hidden_states_6 = rsqrt_1 = None + to_7 = hidden_states_7.to(torch.bfloat16) + hidden_states_7 = None + hidden_states_8 = ( + l_self_modules_model_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + * to_7 + ) + l_self_modules_model_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = ( + to_7 + ) = None + linear_4 = torch._C._nn.linear( + hidden_states_8, + l_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu = torch.nn.functional.silu(linear_4, inplace=False) + linear_4 = None + linear_5 = torch._C._nn.linear( + hidden_states_8, + l_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_8 = l_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_11 = silu * linear_5 + silu = linear_5 = None + down_proj = torch._C._nn.linear( + mul_11, + l_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_11 = l_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_9 = hidden_states_5 + down_proj + hidden_states_5 = down_proj = None + hidden_states_10 = hidden_states_9.to(torch.float32) + pow_3 = hidden_states_10.pow(2) + variance_2 = pow_3.mean(-1, keepdim=True) + pow_3 = None + add_6 = variance_2 + 1e-06 + variance_2 = None + rsqrt_2 = torch.rsqrt(add_6) + add_6 = None + hidden_states_11 = hidden_states_10 * rsqrt_2 + hidden_states_10 = rsqrt_2 = None + to_9 = hidden_states_11.to(torch.bfloat16) + hidden_states_11 = None + hidden_states_12 = ( + l_self_modules_model_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + * to_9 + ) + l_self_modules_model_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + to_9 + ) = None + linear_7 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_4 = linear_7.view((1, 2, -1, 128)) + linear_7 = None + query_states_1 = view_4.transpose(1, 2) + view_4 = None + linear_8 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_5 = linear_8.view((1, 2, -1, 128)) + linear_8 = None + key_states_1 = view_5.transpose(1, 2) + view_5 = None + linear_9 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_12 = l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_6 = 
linear_9.view((1, 2, -1, 128)) + linear_9 = None + value_states_1 = view_6.transpose(1, 2) + view_6 = None + cos_4 = cos_2.unsqueeze(1) + sin_4 = sin_2.unsqueeze(1) + mul_14 = query_states_1 * cos_4 + x1_2 = query_states_1[(Ellipsis, slice(None, 64, None))] + x2_2 = query_states_1[(Ellipsis, slice(64, None, None))] + query_states_1 = None + neg_2 = -x2_2 + x2_2 = None + cat_3 = torch.cat((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + mul_15 = cat_3 * sin_4 + cat_3 = None + q_embed_1 = mul_14 + mul_15 + mul_14 = mul_15 = None + mul_16 = key_states_1 * cos_4 + cos_4 = None + x1_3 = key_states_1[(Ellipsis, slice(None, 64, None))] + x2_3 = key_states_1[(Ellipsis, slice(64, None, None))] + key_states_1 = None + neg_3 = -x2_3 + x2_3 = None + cat_4 = torch.cat((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + mul_17 = cat_4 * sin_4 + cat_4 = sin_4 = None + k_embed_1 = mul_16 + mul_17 + mul_16 = mul_17 = None + getitem_16 = k_embed_1[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_1 = None + hidden_states_13 = getitem_16.expand(1, 4, 4, 2, 128) + getitem_16 = None + key_2 = hidden_states_13.reshape(1, 16, 2, 128) + hidden_states_13 = None + getitem_17 = value_states_1[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_1 = None + hidden_states_14 = getitem_17.expand(1, 4, 4, 2, 128) + getitem_17 = None + value_2 = hidden_states_14.reshape(1, 16, 2, 128) + hidden_states_14 = None + attention_mask_2 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_1 = q_embed_1.contiguous() + q_embed_1 = None + key_3 = key_2.contiguous() + key_2 = None + value_3 = value_2.contiguous() + value_2 = None + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_3, + value_3, + attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_1 = key_3 = value_3 = attention_mask_2 = None + transpose_8 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_8.contiguous() + transpose_8 = None + reshape_5 = attn_output_5.reshape(1, 2, -1) + attn_output_5 = None + attn_output_6 = reshape_5.contiguous() + reshape_5 = None + attn_output_7 = torch._C._nn.linear( + attn_output_6, + l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_6 = l_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_15 = hidden_states_9 + attn_output_7 + hidden_states_9 = attn_output_7 = None + hidden_states_16 = hidden_states_15.to(torch.float32) + pow_4 = hidden_states_16.pow(2) + variance_3 = pow_4.mean(-1, keepdim=True) + pow_4 = None + add_10 = variance_3 + 1e-06 + variance_3 = None + rsqrt_3 = torch.rsqrt(add_10) + add_10 = None + hidden_states_17 = hidden_states_16 * rsqrt_3 + hidden_states_16 = rsqrt_3 = None + to_11 = hidden_states_17.to(torch.bfloat16) + hidden_states_17 = None + hidden_states_18 = ( + l_self_modules_model_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + * to_11 + ) + l_self_modules_model_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = ( + to_11 + ) = None + linear_11 = torch._C._nn.linear( + hidden_states_18, + 
l_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_1 = torch.nn.functional.silu(linear_11, inplace=False) + linear_11 = None + linear_12 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_18 = l_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_20 = silu_1 * linear_12 + silu_1 = linear_12 = None + down_proj_1 = torch._C._nn.linear( + mul_20, + l_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_20 = l_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_19 = hidden_states_15 + down_proj_1 + hidden_states_15 = down_proj_1 = None + hidden_states_20 = hidden_states_19.to(torch.float32) + pow_5 = hidden_states_20.pow(2) + variance_4 = pow_5.mean(-1, keepdim=True) + pow_5 = None + add_12 = variance_4 + 1e-06 + variance_4 = None + rsqrt_4 = torch.rsqrt(add_12) + add_12 = None + hidden_states_21 = hidden_states_20 * rsqrt_4 + hidden_states_20 = rsqrt_4 = None + to_13 = hidden_states_21.to(torch.bfloat16) + hidden_states_21 = None + hidden_states_22 = ( + l_self_modules_model_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + * to_13 + ) + l_self_modules_model_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + to_13 + ) = None + linear_14 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_7 = linear_14.view((1, 2, -1, 128)) + linear_14 = None + query_states_2 = view_7.transpose(1, 2) + view_7 = None + linear_15 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_8 = linear_15.view((1, 2, -1, 128)) + linear_15 = None + key_states_2 = view_8.transpose(1, 2) + view_8 = None + linear_16 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_22 = l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_9 = linear_16.view((1, 2, -1, 128)) + linear_16 = None + value_states_2 = view_9.transpose(1, 2) + view_9 = None + cos_5 = cos_2.unsqueeze(1) + sin_5 = sin_2.unsqueeze(1) + mul_23 = query_states_2 * cos_5 + x1_4 = query_states_2[(Ellipsis, slice(None, 64, None))] + x2_4 = query_states_2[(Ellipsis, slice(64, None, None))] + query_states_2 = None + neg_4 = -x2_4 + x2_4 = None + cat_5 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_24 = cat_5 * sin_5 + cat_5 = None + q_embed_2 = mul_23 + mul_24 + mul_23 = mul_24 = None + mul_25 = key_states_2 * cos_5 + cos_5 = None + x1_5 = 
key_states_2[(Ellipsis, slice(None, 64, None))] + x2_5 = key_states_2[(Ellipsis, slice(64, None, None))] + key_states_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_6 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_26 = cat_6 * sin_5 + cat_6 = sin_5 = None + k_embed_2 = mul_25 + mul_26 + mul_25 = mul_26 = None + getitem_23 = k_embed_2[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_2 = None + hidden_states_23 = getitem_23.expand(1, 4, 4, 2, 128) + getitem_23 = None + key_4 = hidden_states_23.reshape(1, 16, 2, 128) + hidden_states_23 = None + getitem_24 = value_states_2[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_2 = None + hidden_states_24 = getitem_24.expand(1, 4, 4, 2, 128) + getitem_24 = None + value_4 = hidden_states_24.reshape(1, 16, 2, 128) + hidden_states_24 = None + attention_mask_3 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_2 = q_embed_2.contiguous() + q_embed_2 = None + key_5 = key_4.contiguous() + key_4 = None + value_5 = value_4.contiguous() + value_4 = None + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_5, + value_5, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_2 = key_5 = value_5 = attention_mask_3 = None + transpose_12 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_12.contiguous() + transpose_12 = None + reshape_8 = attn_output_9.reshape(1, 2, -1) + attn_output_9 = None + attn_output_10 = reshape_8.contiguous() + reshape_8 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_10 = l_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_25 = hidden_states_19 + attn_output_11 + hidden_states_19 = attn_output_11 = None + hidden_states_26 = hidden_states_25.to(torch.float32) + pow_6 = hidden_states_26.pow(2) + variance_5 = pow_6.mean(-1, keepdim=True) + pow_6 = None + add_16 = variance_5 + 1e-06 + variance_5 = None + rsqrt_5 = torch.rsqrt(add_16) + add_16 = None + hidden_states_27 = hidden_states_26 * rsqrt_5 + hidden_states_26 = rsqrt_5 = None + to_15 = hidden_states_27.to(torch.bfloat16) + hidden_states_27 = None + hidden_states_28 = ( + l_self_modules_model_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + * to_15 + ) + l_self_modules_model_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = ( + to_15 + ) = None + linear_18 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_2 = torch.nn.functional.silu(linear_18, inplace=False) + linear_18 = None + linear_19 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_28 = 
l_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_29 = silu_2 * linear_19 + silu_2 = linear_19 = None + down_proj_2 = torch._C._nn.linear( + mul_29, + l_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_29 = l_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_29 = hidden_states_25 + down_proj_2 + hidden_states_25 = down_proj_2 = None + hidden_states_30 = hidden_states_29.to(torch.float32) + pow_7 = hidden_states_30.pow(2) + variance_6 = pow_7.mean(-1, keepdim=True) + pow_7 = None + add_18 = variance_6 + 1e-06 + variance_6 = None + rsqrt_6 = torch.rsqrt(add_18) + add_18 = None + hidden_states_31 = hidden_states_30 * rsqrt_6 + hidden_states_30 = rsqrt_6 = None + to_17 = hidden_states_31.to(torch.bfloat16) + hidden_states_31 = None + hidden_states_32 = ( + l_self_modules_model_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + * to_17 + ) + l_self_modules_model_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + to_17 + ) = None + linear_21 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_10 = linear_21.view((1, 2, -1, 128)) + linear_21 = None + query_states_3 = view_10.transpose(1, 2) + view_10 = None + linear_22 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_11 = linear_22.view((1, 2, -1, 128)) + linear_22 = None + key_states_3 = view_11.transpose(1, 2) + view_11 = None + linear_23 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_32 = l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_12 = linear_23.view((1, 2, -1, 128)) + linear_23 = None + value_states_3 = view_12.transpose(1, 2) + view_12 = None + getitem_26 = key_states_3[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + key_states_3 = None + hidden_states_33 = getitem_26.expand(1, 4, 4, 2, 128) + getitem_26 = None + key_6 = hidden_states_33.reshape(1, 16, 2, 128) + hidden_states_33 = None + getitem_27 = value_states_3[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_3 = None + hidden_states_34 = getitem_27.expand(1, 4, 4, 2, 128) + getitem_27 = None + value_6 = hidden_states_34.reshape(1, 16, 2, 128) + hidden_states_34 = None + attention_mask_4 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_3 = query_states_3.contiguous() + query_states_3 = None + key_7 = key_6.contiguous() + key_6 = None + value_7 = value_6.contiguous() + value_6 = None + attn_output_12 = 
torch._C._nn.scaled_dot_product_attention( + query_3, + key_7, + value_7, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_3 = key_7 = value_7 = attention_mask_4 = None + transpose_16 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_16.contiguous() + transpose_16 = None + reshape_11 = attn_output_13.reshape(1, 2, -1) + attn_output_13 = None + attn_output_14 = reshape_11.contiguous() + reshape_11 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_14 = l_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_35 = hidden_states_29 + attn_output_15 + hidden_states_29 = attn_output_15 = None + hidden_states_36 = hidden_states_35.to(torch.float32) + pow_8 = hidden_states_36.pow(2) + variance_7 = pow_8.mean(-1, keepdim=True) + pow_8 = None + add_20 = variance_7 + 1e-06 + variance_7 = None + rsqrt_7 = torch.rsqrt(add_20) + add_20 = None + hidden_states_37 = hidden_states_36 * rsqrt_7 + hidden_states_36 = rsqrt_7 = None + to_19 = hidden_states_37.to(torch.bfloat16) + hidden_states_37 = None + hidden_states_38 = ( + l_self_modules_model_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + * to_19 + ) + l_self_modules_model_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = ( + to_19 + ) = None + linear_25 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_3 = torch.nn.functional.silu(linear_25, inplace=False) + linear_25 = None + linear_26 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_38 = l_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_34 = silu_3 * linear_26 + silu_3 = linear_26 = None + down_proj_3 = torch._C._nn.linear( + mul_34, + l_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_34 = l_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_39 = hidden_states_35 + down_proj_3 + hidden_states_35 = down_proj_3 = None + hidden_states_40 = hidden_states_39.to(torch.float32) + pow_9 = hidden_states_40.pow(2) + variance_8 = pow_9.mean(-1, keepdim=True) + pow_9 = None + add_22 = variance_8 + 1e-06 + variance_8 = None + rsqrt_8 = torch.rsqrt(add_22) + add_22 = None + hidden_states_41 = hidden_states_40 * rsqrt_8 + hidden_states_40 = rsqrt_8 = None + to_21 = hidden_states_41.to(torch.bfloat16) + hidden_states_41 = None + hidden_states_42 = ( + l_self_modules_model_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + * to_21 + ) + l_self_modules_model_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + to_21 + ) = None + linear_28 = torch._C._nn.linear( + hidden_states_42, + 
l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_13 = linear_28.view((1, 2, -1, 128)) + linear_28 = None + query_states_4 = view_13.transpose(1, 2) + view_13 = None + linear_29 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_14 = linear_29.view((1, 2, -1, 128)) + linear_29 = None + key_states_4 = view_14.transpose(1, 2) + view_14 = None + linear_30 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_42 = l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_15 = linear_30.view((1, 2, -1, 128)) + linear_30 = None + value_states_4 = view_15.transpose(1, 2) + view_15 = None + cos_6 = cos_2.unsqueeze(1) + sin_6 = sin_2.unsqueeze(1) + mul_37 = query_states_4 * cos_6 + x1_6 = query_states_4[(Ellipsis, slice(None, 64, None))] + x2_6 = query_states_4[(Ellipsis, slice(64, None, None))] + query_states_4 = None + neg_6 = -x2_6 + x2_6 = None + cat_7 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_38 = cat_7 * sin_6 + cat_7 = None + q_embed_3 = mul_37 + mul_38 + mul_37 = mul_38 = None + mul_39 = key_states_4 * cos_6 + cos_6 = None + x1_7 = key_states_4[(Ellipsis, slice(None, 64, None))] + x2_7 = key_states_4[(Ellipsis, slice(64, None, None))] + key_states_4 = None + neg_7 = -x2_7 + x2_7 = None + cat_8 = torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_40 = cat_8 * sin_6 + cat_8 = sin_6 = None + k_embed_3 = mul_39 + mul_40 + mul_39 = mul_40 = None + getitem_33 = k_embed_3[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_3 = None + hidden_states_43 = getitem_33.expand(1, 4, 4, 2, 128) + getitem_33 = None + key_8 = hidden_states_43.reshape(1, 16, 2, 128) + hidden_states_43 = None + getitem_34 = value_states_4[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_4 = None + hidden_states_44 = getitem_34.expand(1, 4, 4, 2, 128) + getitem_34 = None + value_8 = hidden_states_44.reshape(1, 16, 2, 128) + hidden_states_44 = None + attention_mask_5 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_4 = q_embed_3.contiguous() + q_embed_3 = None + key_9 = key_8.contiguous() + key_8 = None + value_9 = value_8.contiguous() + value_8 = None + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_9, + value_9, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_4 = key_9 = value_9 = attention_mask_5 = None + transpose_20 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_20.contiguous() + transpose_20 = None + reshape_14 = attn_output_17.reshape(1, 2, -1) + attn_output_17 = None + attn_output_18 = reshape_14.contiguous() + reshape_14 = None + 
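+ # NOTE: every decoder layer repeats the attention block traced above. A
+ # readable sketch of what the traced ops compute follows; the helper names
+ # are illustrative only, and the shapes are the ones traced here (16 query
+ # heads, 4 KV heads, head_dim 128, seq_len 2,
+ # scale 0.08838834764831845 == 1 / sqrt(128)):
+ #
+ #     def rotate_half(x):                 # RoPE half-rotation over head_dim
+ #         x1, x2 = x[..., :64], x[..., 64:]
+ #         return torch.cat((-x2, x1), dim=-1)
+ #
+ #     q = q * cos + rotate_half(q) * sin  # apply rotary position embedding
+ #     k = k * cos + rotate_half(k) * sin
+ #     # grouped-query attention: repeat each of the 4 KV heads 4x to match
+ #     # the 16 query heads
+ #     k = k[:, :, None].expand(1, 4, 4, 2, 128).reshape(1, 16, 2, 128)
+ #     v = v[:, :, None].expand(1, 4, 4, 2, 128).reshape(1, 16, 2, 128)
+ #     out = torch.nn.functional.scaled_dot_product_attention(
+ #         q, k, v, attn_mask=causal_mask, scale=1 / math.sqrt(128)
+ #     )
+ #
+ # Some layers in this trace (e.g. the fourth one above, using query_states_3
+ # directly) skip the cos/sin step entirely and attend over the raw q/k
+ # projections, so the rotary block is absent there.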
attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_18 = l_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_45 = hidden_states_39 + attn_output_19 + hidden_states_39 = attn_output_19 = None + hidden_states_46 = hidden_states_45.to(torch.float32) + pow_10 = hidden_states_46.pow(2) + variance_9 = pow_10.mean(-1, keepdim=True) + pow_10 = None + add_26 = variance_9 + 1e-06 + variance_9 = None + rsqrt_9 = torch.rsqrt(add_26) + add_26 = None + hidden_states_47 = hidden_states_46 * rsqrt_9 + hidden_states_46 = rsqrt_9 = None + to_23 = hidden_states_47.to(torch.bfloat16) + hidden_states_47 = None + hidden_states_48 = ( + l_self_modules_model_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + * to_23 + ) + l_self_modules_model_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = ( + to_23 + ) = None + linear_32 = torch._C._nn.linear( + hidden_states_48, + l_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_4 = torch.nn.functional.silu(linear_32, inplace=False) + linear_32 = None + linear_33 = torch._C._nn.linear( + hidden_states_48, + l_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_48 = l_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_43 = silu_4 * linear_33 + silu_4 = linear_33 = None + down_proj_4 = torch._C._nn.linear( + mul_43, + l_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_43 = l_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_49 = hidden_states_45 + down_proj_4 + hidden_states_45 = down_proj_4 = None + hidden_states_50 = hidden_states_49.to(torch.float32) + pow_11 = hidden_states_50.pow(2) + variance_10 = pow_11.mean(-1, keepdim=True) + pow_11 = None + add_28 = variance_10 + 1e-06 + variance_10 = None + rsqrt_10 = torch.rsqrt(add_28) + add_28 = None + hidden_states_51 = hidden_states_50 * rsqrt_10 + hidden_states_50 = rsqrt_10 = None + to_25 = hidden_states_51.to(torch.bfloat16) + hidden_states_51 = None + hidden_states_52 = ( + l_self_modules_model_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + * to_25 + ) + l_self_modules_model_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + to_25 + ) = None + linear_35 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_16 = linear_35.view((1, 2, -1, 128)) + linear_35 = None + query_states_5 = view_16.transpose(1, 2) + view_16 = None + linear_36 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_, + 
None, + ) + l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_17 = linear_36.view((1, 2, -1, 128)) + linear_36 = None + key_states_5 = view_17.transpose(1, 2) + view_17 = None + linear_37 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_52 = l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_18 = linear_37.view((1, 2, -1, 128)) + linear_37 = None + value_states_5 = view_18.transpose(1, 2) + view_18 = None + cos_7 = cos_2.unsqueeze(1) + sin_7 = sin_2.unsqueeze(1) + mul_46 = query_states_5 * cos_7 + x1_8 = query_states_5[(Ellipsis, slice(None, 64, None))] + x2_8 = query_states_5[(Ellipsis, slice(64, None, None))] + query_states_5 = None + neg_8 = -x2_8 + x2_8 = None + cat_9 = torch.cat((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + mul_47 = cat_9 * sin_7 + cat_9 = None + q_embed_4 = mul_46 + mul_47 + mul_46 = mul_47 = None + mul_48 = key_states_5 * cos_7 + cos_7 = None + x1_9 = key_states_5[(Ellipsis, slice(None, 64, None))] + x2_9 = key_states_5[(Ellipsis, slice(64, None, None))] + key_states_5 = None + neg_9 = -x2_9 + x2_9 = None + cat_10 = torch.cat((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + mul_49 = cat_10 * sin_7 + cat_10 = sin_7 = None + k_embed_4 = mul_48 + mul_49 + mul_48 = mul_49 = None + getitem_40 = k_embed_4[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_4 = None + hidden_states_53 = getitem_40.expand(1, 4, 4, 2, 128) + getitem_40 = None + key_10 = hidden_states_53.reshape(1, 16, 2, 128) + hidden_states_53 = None + getitem_41 = value_states_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_5 = None + hidden_states_54 = getitem_41.expand(1, 4, 4, 2, 128) + getitem_41 = None + value_10 = hidden_states_54.reshape(1, 16, 2, 128) + hidden_states_54 = None + attention_mask_6 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_5 = q_embed_4.contiguous() + q_embed_4 = None + key_11 = key_10.contiguous() + key_10 = None + value_11 = value_10.contiguous() + value_10 = None + attn_output_20 = torch._C._nn.scaled_dot_product_attention( + query_5, + key_11, + value_11, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_5 = key_11 = value_11 = attention_mask_6 = None + transpose_24 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_24.contiguous() + transpose_24 = None + reshape_17 = attn_output_21.reshape(1, 2, -1) + attn_output_21 = None + attn_output_22 = reshape_17.contiguous() + reshape_17 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_22 = l_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_55 = hidden_states_49 + attn_output_23 + hidden_states_49 = attn_output_23 = None + hidden_states_56 = hidden_states_55.to(torch.float32) + pow_12 = hidden_states_56.pow(2) + variance_11 = 
pow_12.mean(-1, keepdim=True) + pow_12 = None + add_32 = variance_11 + 1e-06 + variance_11 = None + rsqrt_11 = torch.rsqrt(add_32) + add_32 = None + hidden_states_57 = hidden_states_56 * rsqrt_11 + hidden_states_56 = rsqrt_11 = None + to_27 = hidden_states_57.to(torch.bfloat16) + hidden_states_57 = None + hidden_states_58 = ( + l_self_modules_model_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + * to_27 + ) + l_self_modules_model_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = ( + to_27 + ) = None + linear_39 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_5 = torch.nn.functional.silu(linear_39, inplace=False) + linear_39 = None + linear_40 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_58 = l_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_52 = silu_5 * linear_40 + silu_5 = linear_40 = None + down_proj_5 = torch._C._nn.linear( + mul_52, + l_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_52 = l_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_59 = hidden_states_55 + down_proj_5 + hidden_states_55 = down_proj_5 = None + hidden_states_60 = hidden_states_59.to(torch.float32) + pow_13 = hidden_states_60.pow(2) + variance_12 = pow_13.mean(-1, keepdim=True) + pow_13 = None + add_34 = variance_12 + 1e-06 + variance_12 = None + rsqrt_12 = torch.rsqrt(add_34) + add_34 = None + hidden_states_61 = hidden_states_60 * rsqrt_12 + hidden_states_60 = rsqrt_12 = None + to_29 = hidden_states_61.to(torch.bfloat16) + hidden_states_61 = None + hidden_states_62 = ( + l_self_modules_model_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + * to_29 + ) + l_self_modules_model_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + to_29 + ) = None + linear_42 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_19 = linear_42.view((1, 2, -1, 128)) + linear_42 = None + query_states_6 = view_19.transpose(1, 2) + view_19 = None + linear_43 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_20 = linear_43.view((1, 2, -1, 128)) + linear_43 = None + key_states_6 = view_20.transpose(1, 2) + view_20 = None + linear_44 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_62 = 
l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_21 = linear_44.view((1, 2, -1, 128)) + linear_44 = None + value_states_6 = view_21.transpose(1, 2) + view_21 = None + cos_8 = cos_2.unsqueeze(1) + sin_8 = sin_2.unsqueeze(1) + mul_55 = query_states_6 * cos_8 + x1_10 = query_states_6[(Ellipsis, slice(None, 64, None))] + x2_10 = query_states_6[(Ellipsis, slice(64, None, None))] + query_states_6 = None + neg_10 = -x2_10 + x2_10 = None + cat_11 = torch.cat((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + mul_56 = cat_11 * sin_8 + cat_11 = None + q_embed_5 = mul_55 + mul_56 + mul_55 = mul_56 = None + mul_57 = key_states_6 * cos_8 + cos_8 = None + x1_11 = key_states_6[(Ellipsis, slice(None, 64, None))] + x2_11 = key_states_6[(Ellipsis, slice(64, None, None))] + key_states_6 = None + neg_11 = -x2_11 + x2_11 = None + cat_12 = torch.cat((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + mul_58 = cat_12 * sin_8 + cat_12 = sin_8 = None + k_embed_5 = mul_57 + mul_58 + mul_57 = mul_58 = None + getitem_47 = k_embed_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_5 = None + hidden_states_63 = getitem_47.expand(1, 4, 4, 2, 128) + getitem_47 = None + key_12 = hidden_states_63.reshape(1, 16, 2, 128) + hidden_states_63 = None + getitem_48 = value_states_6[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_6 = None + hidden_states_64 = getitem_48.expand(1, 4, 4, 2, 128) + getitem_48 = None + value_12 = hidden_states_64.reshape(1, 16, 2, 128) + hidden_states_64 = None + attention_mask_7 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_6 = q_embed_5.contiguous() + q_embed_5 = None + key_13 = key_12.contiguous() + key_12 = None + value_13 = value_12.contiguous() + value_12 = None + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_13, + value_13, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_6 = key_13 = value_13 = attention_mask_7 = None + transpose_28 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_28.contiguous() + transpose_28 = None + reshape_20 = attn_output_25.reshape(1, 2, -1) + attn_output_25 = None + attn_output_26 = reshape_20.contiguous() + reshape_20 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_26 = l_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_65 = hidden_states_59 + attn_output_27 + hidden_states_59 = attn_output_27 = None + hidden_states_66 = hidden_states_65.to(torch.float32) + pow_14 = hidden_states_66.pow(2) + variance_13 = pow_14.mean(-1, keepdim=True) + pow_14 = None + add_38 = variance_13 + 1e-06 + variance_13 = None + rsqrt_13 = torch.rsqrt(add_38) + add_38 = None + hidden_states_67 = hidden_states_66 * rsqrt_13 + hidden_states_66 = rsqrt_13 = None + to_31 = hidden_states_67.to(torch.bfloat16) + hidden_states_67 = None + hidden_states_68 = ( + l_self_modules_model_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + * to_31 + ) 
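+ # NOTE: the normalisation just computed and the feed-forward ops that follow
+ # are the per-layer RMSNorm + SwiGLU tail repeated throughout this graph. A
+ # readable sketch (names illustrative; the projections are the gate/up/down
+ # weights bound above):
+ #
+ #     def rms_norm(x, weight, eps=1e-06):  # computed in float32, cast back
+ #         x = x.to(torch.float32)
+ #         x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+ #         return weight * x.to(torch.bfloat16)
+ #
+ #     def mlp(x):                          # SwiGLU feed-forward
+ #         return down_proj(torch.nn.functional.silu(gate_proj(x)) * up_proj(x))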
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = (
+            to_31
+        ) = None
+        linear_46 = torch._C._nn.linear(
+            hidden_states_68,
+            l_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_6 = torch.nn.functional.silu(linear_46, inplace=False)
+        linear_46 = None
+        linear_47 = torch._C._nn.linear(
+            hidden_states_68,
+            l_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_68 = l_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_61 = silu_6 * linear_47
+        silu_6 = linear_47 = None
+        down_proj_6 = torch._C._nn.linear(
+            mul_61,
+            l_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_61 = l_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_69 = hidden_states_65 + down_proj_6
+        hidden_states_65 = down_proj_6 = None
+        # decoder layer 7: pre-norm attention + SwiGLU MLP (no rotary embedding on this layer)
+        hidden_states_70 = hidden_states_69.to(torch.float32)
+        pow_15 = hidden_states_70.pow(2)
+        variance_14 = pow_15.mean(-1, keepdim=True)
+        pow_15 = None
+        add_40 = variance_14 + 1e-06
+        variance_14 = None
+        rsqrt_14 = torch.rsqrt(add_40)
+        add_40 = None
+        hidden_states_71 = hidden_states_70 * rsqrt_14
+        hidden_states_70 = rsqrt_14 = None
+        to_33 = hidden_states_71.to(torch.bfloat16)
+        hidden_states_71 = None
+        hidden_states_72 = (
+            l_self_modules_model_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_
+            * to_33
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = (
+            to_33
+        ) = None
+        linear_49 = torch._C._nn.linear(
+            hidden_states_72,
+            l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_22 = linear_49.view((1, 2, -1, 128))
+        linear_49 = None
+        query_states_7 = view_22.transpose(1, 2)
+        view_22 = None
+        linear_50 = torch._C._nn.linear(
+            hidden_states_72,
+            l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_23 = linear_50.view((1, 2, -1, 128))
+        linear_50 = None
+        key_states_7 = view_23.transpose(1, 2)
+        view_23 = None
+        linear_51 = torch._C._nn.linear(
+            hidden_states_72,
+            l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_72 = l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_24 = linear_51.view((1, 2, -1, 128))
+        linear_51 = None
+        value_states_7 = view_24.transpose(1, 2)
+        view_24 = None
+        getitem_50 = key_states_7[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        key_states_7 = None
+        hidden_states_73 = getitem_50.expand(1, 4, 4, 2, 128)
+        getitem_50 = None
+        key_14 = hidden_states_73.reshape(1, 16, 2, 128)
+        hidden_states_73 = None
+        getitem_51 = value_states_7[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_7 = None
+        hidden_states_74 = getitem_51.expand(1, 4, 4, 2, 128)
+        getitem_51 = None
+        value_14 = hidden_states_74.reshape(1, 16, 2, 128)
+        hidden_states_74 = None
+        attention_mask_8 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_7 = query_states_7.contiguous()
+        query_states_7 = None
+        key_15 = key_14.contiguous()
+        key_14 = None
+        value_15 = value_14.contiguous()
+        value_14 = None
+        attn_output_28 = torch._C._nn.scaled_dot_product_attention(
+            query_7,
+            key_15,
+            value_15,
+            attn_mask=attention_mask_8,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_7 = key_15 = value_15 = attention_mask_8 = None
+        transpose_32 = attn_output_28.transpose(1, 2)
+        attn_output_28 = None
+        attn_output_29 = transpose_32.contiguous()
+        transpose_32 = None
+        reshape_23 = attn_output_29.reshape(1, 2, -1)
+        attn_output_29 = None
+        attn_output_30 = reshape_23.contiguous()
+        reshape_23 = None
+        attn_output_31 = torch._C._nn.linear(
+            attn_output_30,
+            l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_30 = l_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_75 = hidden_states_69 + attn_output_31
+        hidden_states_69 = attn_output_31 = None
+        hidden_states_76 = hidden_states_75.to(torch.float32)
+        pow_16 = hidden_states_76.pow(2)
+        variance_15 = pow_16.mean(-1, keepdim=True)
+        pow_16 = None
+        add_42 = variance_15 + 1e-06
+        variance_15 = None
+        rsqrt_15 = torch.rsqrt(add_42)
+        add_42 = None
+        hidden_states_77 = hidden_states_76 * rsqrt_15
+        hidden_states_76 = rsqrt_15 = None
+        to_35 = hidden_states_77.to(torch.bfloat16)
+        hidden_states_77 = None
+        hidden_states_78 = (
+            l_self_modules_model_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_
+            * to_35
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = (
+            to_35
+        ) = None
+        linear_53 = torch._C._nn.linear(
+            hidden_states_78,
+            l_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_7 = torch.nn.functional.silu(linear_53, inplace=False)
+        linear_53 = None
+        linear_54 = torch._C._nn.linear(
+            hidden_states_78,
+            l_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_78 = l_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_66 = silu_7 * linear_54
+        silu_7 = linear_54 = None
+        down_proj_7 = torch._C._nn.linear(
+            mul_66,
+            l_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_66 = l_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_79 = hidden_states_75 + down_proj_7
+        hidden_states_75 = down_proj_7 = None
+        # decoder layer 8: pre-norm attention + SwiGLU MLP (rotary embedding applied to q/k)
+        hidden_states_80 = hidden_states_79.to(torch.float32)
+        pow_17 = hidden_states_80.pow(2)
+        variance_16 = pow_17.mean(-1, keepdim=True)
+        pow_17 = None
+        add_44 = variance_16 + 1e-06
+        variance_16 = None
+        rsqrt_16 = torch.rsqrt(add_44)
+        add_44 = None
+        hidden_states_81 = hidden_states_80 * rsqrt_16
+        hidden_states_80 = rsqrt_16 = None
+        to_37 = hidden_states_81.to(torch.bfloat16)
+        hidden_states_81 = None
+        hidden_states_82 = (
+            l_self_modules_model_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_
+            * to_37
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = (
+            to_37
+        ) = None
+        linear_56 = torch._C._nn.linear(
+            hidden_states_82,
+            l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_25 = linear_56.view((1, 2, -1, 128))
+        linear_56 = None
+        query_states_8 = view_25.transpose(1, 2)
+        view_25 = None
+        linear_57 = torch._C._nn.linear(
+            hidden_states_82,
+            l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_26 = linear_57.view((1, 2, -1, 128))
+        linear_57 = None
+        key_states_8 = view_26.transpose(1, 2)
+        view_26 = None
+        linear_58 = torch._C._nn.linear(
+            hidden_states_82,
+            l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_82 = l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_27 = linear_58.view((1, 2, -1, 128))
+        linear_58 = None
+        value_states_8 = view_27.transpose(1, 2)
+        view_27 = None
+        cos_9 = cos_2.unsqueeze(1)
+        sin_9 = sin_2.unsqueeze(1)
+        mul_69 = query_states_8 * cos_9
+        x1_12 = query_states_8[(Ellipsis, slice(None, 64, None))]
+        x2_12 = query_states_8[(Ellipsis, slice(64, None, None))]
+        query_states_8 = None
+        neg_12 = -x2_12
+        x2_12 = None
+        cat_13 = torch.cat((neg_12, x1_12), dim=-1)
+        neg_12 = x1_12 = None
+        mul_70 = cat_13 * sin_9
+        cat_13 = None
+        q_embed_6 = mul_69 + mul_70
+        mul_69 = mul_70 = None
+        mul_71 = key_states_8 * cos_9
+        cos_9 = None
+        x1_13 = key_states_8[(Ellipsis, slice(None, 64, None))]
+        x2_13 = key_states_8[(Ellipsis, slice(64, None, None))]
+        key_states_8 = None
+        neg_13 = -x2_13
+        x2_13 = None
+        cat_14 = torch.cat((neg_13, x1_13), dim=-1)
+        neg_13 = x1_13 = None
+        mul_72 = cat_14 * sin_9
+        cat_14 = sin_9 = None
+        k_embed_6 = mul_71 + mul_72
+        mul_71 = mul_72 = None
+        getitem_57 = k_embed_6[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        k_embed_6 = None
+        hidden_states_83 = getitem_57.expand(1, 4, 4, 2, 128)
+        getitem_57 = None
+        key_16 = hidden_states_83.reshape(1, 16, 2, 128)
+        hidden_states_83 = None
+        getitem_58 = value_states_8[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_8 = None
+        hidden_states_84 = getitem_58.expand(1, 4, 4, 2, 128)
+        getitem_58 = None
+        value_16 = hidden_states_84.reshape(1, 16, 2, 128)
+        hidden_states_84 = None
+        attention_mask_9 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_8 = q_embed_6.contiguous()
+        q_embed_6 = None
+        key_17 = key_16.contiguous()
+        key_16 = None
+        value_17 = value_16.contiguous()
+        value_16 = None
+        attn_output_32 = torch._C._nn.scaled_dot_product_attention(
+            query_8,
+            key_17,
+            value_17,
+            attn_mask=attention_mask_9,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_8 = key_17 = value_17 = attention_mask_9 = None
+        transpose_36 = attn_output_32.transpose(1, 2)
+        attn_output_32 = None
+        attn_output_33 = transpose_36.contiguous()
+        transpose_36 = None
+        reshape_26 = attn_output_33.reshape(1, 2, -1)
+        attn_output_33 = None
+        attn_output_34 = reshape_26.contiguous()
+        reshape_26 = None
+        attn_output_35 = torch._C._nn.linear(
+            attn_output_34,
+            l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_34 = l_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_85 = hidden_states_79 + attn_output_35
+        hidden_states_79 = attn_output_35 = None
+        hidden_states_86 = hidden_states_85.to(torch.float32)
+        pow_18 = hidden_states_86.pow(2)
+        variance_17 = pow_18.mean(-1, keepdim=True)
+        pow_18 = None
+        add_48 = variance_17 + 1e-06
+        variance_17 = None
+        rsqrt_17 = torch.rsqrt(add_48)
+        add_48 = None
+        hidden_states_87 = hidden_states_86 * rsqrt_17
+        hidden_states_86 = rsqrt_17 = None
+        to_39 = hidden_states_87.to(torch.bfloat16)
+        hidden_states_87 = None
+        hidden_states_88 = (
+            l_self_modules_model_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_
+            * to_39
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = (
+            to_39
+        ) = None
+        linear_60 = torch._C._nn.linear(
+            hidden_states_88,
+            l_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_8 = torch.nn.functional.silu(linear_60, inplace=False)
+        linear_60 = None
+        linear_61 = torch._C._nn.linear(
+            hidden_states_88,
+            l_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_88 = l_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_75 = silu_8 * linear_61
+        silu_8 = linear_61 = None
+        down_proj_8 = torch._C._nn.linear(
+            mul_75,
+            l_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_75 = l_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_89 = hidden_states_85 + down_proj_8
+        hidden_states_85 = down_proj_8 = None
+        # decoder layer 9: pre-norm attention + SwiGLU MLP (rotary embedding applied to q/k)
+        hidden_states_90 = hidden_states_89.to(torch.float32)
+        pow_19 = hidden_states_90.pow(2)
+        variance_18 = pow_19.mean(-1, keepdim=True)
+        pow_19 = None
+        add_50 = variance_18 + 1e-06
+        variance_18 = None
+        rsqrt_18 = torch.rsqrt(add_50)
+        add_50 = None
+        hidden_states_91 = hidden_states_90 * rsqrt_18
+        hidden_states_90 = rsqrt_18 = None
+        to_41 = hidden_states_91.to(torch.bfloat16)
+        hidden_states_91 = None
+        hidden_states_92 = (
+            l_self_modules_model_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_
+            * to_41
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = (
+            to_41
+        ) = None
+        linear_63 = torch._C._nn.linear(
+            hidden_states_92,
+            l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_28 = linear_63.view((1, 2, -1, 128))
+        linear_63 = None
+        query_states_9 = view_28.transpose(1, 2)
+        view_28 = None
+        linear_64 = torch._C._nn.linear(
+            hidden_states_92,
+            l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_29 = linear_64.view((1, 2, -1, 128))
+        linear_64 = None
+        key_states_9 = view_29.transpose(1, 2)
+        view_29 = None
+        linear_65 = torch._C._nn.linear(
+            hidden_states_92,
+            l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_92 = l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_30 = linear_65.view((1, 2, -1, 128))
+        linear_65 = None
+        value_states_9 = view_30.transpose(1, 2)
+        view_30 = None
+        cos_10 = cos_2.unsqueeze(1)
+        sin_10 = sin_2.unsqueeze(1)
+        mul_78 = query_states_9 * cos_10
+        x1_14 = query_states_9[(Ellipsis, slice(None, 64, None))]
+        x2_14 = query_states_9[(Ellipsis, slice(64, None, None))]
+        query_states_9 = None
+        neg_14 = -x2_14
+        x2_14 = None
+        cat_15 = torch.cat((neg_14, x1_14), dim=-1)
+        neg_14 = x1_14 = None
+        mul_79 = cat_15 * sin_10
+        cat_15 = None
+        q_embed_7 = mul_78 + mul_79
+        mul_78 = mul_79 = None
+        mul_80 = key_states_9 * cos_10
+        cos_10 = None
+        x1_15 = key_states_9[(Ellipsis, slice(None, 64, None))]
+        x2_15 = key_states_9[(Ellipsis, slice(64, None, None))]
+        key_states_9 = None
+        neg_15 = -x2_15
+        x2_15 = None
+        cat_16 = torch.cat((neg_15, x1_15), dim=-1)
+        neg_15 = x1_15 = None
+        mul_81 = cat_16 * sin_10
+        cat_16 = sin_10 = None
+        k_embed_7 = mul_80 + mul_81
+        mul_80 = mul_81 = None
+        getitem_64 = k_embed_7[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        k_embed_7 = None
+        hidden_states_93 = getitem_64.expand(1, 4, 4, 2, 128)
+        getitem_64 = None
+        key_18 = hidden_states_93.reshape(1, 16, 2, 128)
+        hidden_states_93 = None
+        getitem_65 = value_states_9[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_9 = None
+        hidden_states_94 = getitem_65.expand(1, 4, 4, 2, 128)
+        getitem_65 = None
+        value_18 = hidden_states_94.reshape(1, 16, 2, 128)
+        hidden_states_94 = None
+        attention_mask_10 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_9 = q_embed_7.contiguous()
+        q_embed_7 = None
+        key_19 = key_18.contiguous()
+        key_18 = None
+        value_19 = value_18.contiguous()
+        value_18 = None
+        attn_output_36 = torch._C._nn.scaled_dot_product_attention(
+            query_9,
+            key_19,
+            value_19,
+            attn_mask=attention_mask_10,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_9 = key_19 = value_19 = attention_mask_10 = None
+        transpose_40 = attn_output_36.transpose(1, 2)
+        attn_output_36 = None
+        attn_output_37 = transpose_40.contiguous()
+        transpose_40 = None
+        reshape_29 = attn_output_37.reshape(1, 2, -1)
+        attn_output_37 = None
+        attn_output_38 = reshape_29.contiguous()
+        reshape_29 = None
+        attn_output_39 = torch._C._nn.linear(
+            attn_output_38,
+            l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_38 = l_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_95 = hidden_states_89 + attn_output_39
+        hidden_states_89 = attn_output_39 = None
+        hidden_states_96 = hidden_states_95.to(torch.float32)
+        pow_20 = hidden_states_96.pow(2)
+        variance_19 = pow_20.mean(-1, keepdim=True)
+        pow_20 = None
+        add_54 = variance_19 + 1e-06
+        variance_19 = None
+        rsqrt_19 = torch.rsqrt(add_54)
+        add_54 = None
+        hidden_states_97 = hidden_states_96 * rsqrt_19
+        hidden_states_96 = rsqrt_19 = None
+        to_43 = hidden_states_97.to(torch.bfloat16)
+        hidden_states_97 = None
+        hidden_states_98 = (
+            l_self_modules_model_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_
+            * to_43
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = (
+            to_43
+        ) = None
+        linear_67 = torch._C._nn.linear(
+            hidden_states_98,
+            l_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_9 = torch.nn.functional.silu(linear_67, inplace=False)
+        linear_67 = None
+        linear_68 = torch._C._nn.linear(
+            hidden_states_98,
+            l_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_98 = l_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_84 = silu_9 * linear_68
+        silu_9 = linear_68 = None
+        down_proj_9 = torch._C._nn.linear(
+            mul_84,
+            l_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_84 = l_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_99 = hidden_states_95 + down_proj_9
+        hidden_states_95 = down_proj_9 = None
+        # decoder layer 10: pre-norm attention + SwiGLU MLP (rotary embedding applied to q/k)
+        hidden_states_100 = hidden_states_99.to(torch.float32)
+        pow_21 = hidden_states_100.pow(2)
+        variance_20 = pow_21.mean(-1, keepdim=True)
+        pow_21 = None
+        add_56 = variance_20 + 1e-06
+        variance_20 = None
+        rsqrt_20 = torch.rsqrt(add_56)
+        add_56 = None
+        hidden_states_101 = hidden_states_100 * rsqrt_20
+        hidden_states_100 = rsqrt_20 = None
+        to_45 = hidden_states_101.to(torch.bfloat16)
+        hidden_states_101 = None
+        hidden_states_102 = (
+            l_self_modules_model_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_
+            * to_45
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = (
+            to_45
+        ) = None
+        linear_70 = torch._C._nn.linear(
+            hidden_states_102,
+            l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_31 = linear_70.view((1, 2, -1, 128))
+        linear_70 = None
+        query_states_10 = view_31.transpose(1, 2)
+        view_31 = None
+        linear_71 = torch._C._nn.linear(
+            hidden_states_102,
+            l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_32 = linear_71.view((1, 2, -1, 128))
+        linear_71 = None
+        key_states_10 = view_32.transpose(1, 2)
+        view_32 = None
+        linear_72 = torch._C._nn.linear(
+            hidden_states_102,
+            l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_102 = l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_33 = linear_72.view((1, 2, -1, 128))
+        linear_72 = None
+        value_states_10 = view_33.transpose(1, 2)
+        view_33 = None
+        cos_11 = cos_2.unsqueeze(1)
+        sin_11 = sin_2.unsqueeze(1)
+        mul_87 = query_states_10 * cos_11
+        x1_16 = query_states_10[(Ellipsis, slice(None, 64, None))]
+        x2_16 = query_states_10[(Ellipsis, slice(64, None, None))]
+        query_states_10 = None
+        neg_16 = -x2_16
+        x2_16 = None
+        cat_17 = torch.cat((neg_16, x1_16), dim=-1)
+        neg_16 = x1_16 = None
+        mul_88 = cat_17 * sin_11
+        cat_17 = None
+        q_embed_8 = mul_87 + mul_88
+        mul_87 = mul_88 = None
+        mul_89 = key_states_10 * cos_11
+        cos_11 = None
+        x1_17 = key_states_10[(Ellipsis, slice(None, 64, None))]
+        x2_17 = key_states_10[(Ellipsis, slice(64, None, None))]
+        key_states_10 = None
+        neg_17 = -x2_17
+        x2_17 = None
+        cat_18 = torch.cat((neg_17, x1_17), dim=-1)
+        neg_17 = x1_17 = None
+        mul_90 = cat_18 * sin_11
+        cat_18 = sin_11 = None
+        k_embed_8 = mul_89 + mul_90
+        mul_89 = mul_90 = None
+        getitem_71 = k_embed_8[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        k_embed_8 = None
+        hidden_states_103 = getitem_71.expand(1, 4, 4, 2, 128)
+        getitem_71 = None
+        key_20 = hidden_states_103.reshape(1, 16, 2, 128)
+        hidden_states_103 = None
+        getitem_72 = value_states_10[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_10 = None
+        hidden_states_104 = getitem_72.expand(1, 4, 4, 2, 128)
+        getitem_72 = None
+        value_20 = hidden_states_104.reshape(1, 16, 2, 128)
+        hidden_states_104 = None
+        attention_mask_11 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_10 = q_embed_8.contiguous()
+        q_embed_8 = None
+        key_21 = key_20.contiguous()
+        key_20 = None
+        value_21 = value_20.contiguous()
+        value_20 = None
+        attn_output_40 = torch._C._nn.scaled_dot_product_attention(
+            query_10,
+            key_21,
+            value_21,
+            attn_mask=attention_mask_11,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_10 = key_21 = value_21 = attention_mask_11 = None
+        transpose_44 = attn_output_40.transpose(1, 2)
+        attn_output_40 = None
+        attn_output_41 = transpose_44.contiguous()
+        transpose_44 = None
+        reshape_32 = attn_output_41.reshape(1, 2, -1)
+        attn_output_41 = None
+        attn_output_42 = reshape_32.contiguous()
+        reshape_32 = None
+        attn_output_43 = torch._C._nn.linear(
+            attn_output_42,
+            l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_42 = l_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_105 = hidden_states_99 + attn_output_43
+        hidden_states_99 = attn_output_43 = None
+        hidden_states_106 = hidden_states_105.to(torch.float32)
+        pow_22 = hidden_states_106.pow(2)
+        variance_21 = pow_22.mean(-1, keepdim=True)
+        pow_22 = None
+        add_60 = variance_21 + 1e-06
+        variance_21 = None
+        rsqrt_21 = torch.rsqrt(add_60)
+        add_60 = None
+        hidden_states_107 = hidden_states_106 * rsqrt_21
+        hidden_states_106 = rsqrt_21 = None
+        to_47 = hidden_states_107.to(torch.bfloat16)
+        hidden_states_107 = None
+        hidden_states_108 = (
+            l_self_modules_model_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_
+            * to_47
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = (
+            to_47
+        ) = None
+        linear_74 = torch._C._nn.linear(
+            hidden_states_108,
+            l_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_10 = torch.nn.functional.silu(linear_74, inplace=False)
+        linear_74 = None
+        linear_75 = torch._C._nn.linear(
+            hidden_states_108,
+            l_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_108 = l_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_93 = silu_10 * linear_75
+        silu_10 = linear_75 = None
+        down_proj_10 = torch._C._nn.linear(
+            mul_93,
+            l_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_93 = l_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_109 = hidden_states_105 + down_proj_10
+        hidden_states_105 = down_proj_10 = None
+        # decoder layer 11: pre-norm attention + SwiGLU MLP (no rotary embedding on this layer)
+        hidden_states_110 = hidden_states_109.to(torch.float32)
+        pow_23 = hidden_states_110.pow(2)
+        variance_22 = pow_23.mean(-1, keepdim=True)
+        pow_23 = None
+        add_62 = variance_22 + 1e-06
+        variance_22 = None
+        rsqrt_22 = torch.rsqrt(add_62)
+        add_62 = None
+        hidden_states_111 = hidden_states_110 * rsqrt_22
+        hidden_states_110 = rsqrt_22 = None
+        to_49 = hidden_states_111.to(torch.bfloat16)
+        hidden_states_111 = None
+        hidden_states_112 = (
+            l_self_modules_model_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_
+            * to_49
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = (
+            to_49
+        ) = None
+        linear_77 = torch._C._nn.linear(
+            hidden_states_112,
+            l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_34 = linear_77.view((1, 2, -1, 128))
+        linear_77 = None
+        query_states_11 = view_34.transpose(1, 2)
+        view_34 = None
+        linear_78 = torch._C._nn.linear(
+            hidden_states_112,
+            l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_35 = linear_78.view((1, 2, -1, 128))
+        linear_78 = None
+        key_states_11 = view_35.transpose(1, 2)
+        view_35 = None
+        linear_79 = torch._C._nn.linear(
+            hidden_states_112,
+            l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_112 = l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_36 = linear_79.view((1, 2, -1, 128))
+        linear_79 = None
+        value_states_11 = view_36.transpose(1, 2)
+        view_36 = None
+        getitem_74 = key_states_11[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        key_states_11 = None
+        hidden_states_113 = getitem_74.expand(1, 4, 4, 2, 128)
+        getitem_74 = None
+        key_22 = hidden_states_113.reshape(1, 16, 2, 128)
+        hidden_states_113 = None
+        getitem_75 = value_states_11[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_11 = None
+        hidden_states_114 = getitem_75.expand(1, 4, 4, 2, 128)
+        getitem_75 = None
+        value_22 = hidden_states_114.reshape(1, 16, 2, 128)
+        hidden_states_114 = None
+        attention_mask_12 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_11 = query_states_11.contiguous()
+        query_states_11 = None
+        key_23 = key_22.contiguous()
+        key_22 = None
+        value_23 = value_22.contiguous()
+        value_22 = None
+        attn_output_44 = torch._C._nn.scaled_dot_product_attention(
+            query_11,
+            key_23,
+            value_23,
+            attn_mask=attention_mask_12,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_11 = key_23 = value_23 = attention_mask_12 = None
+        transpose_48 = attn_output_44.transpose(1, 2)
+        attn_output_44 = None
+        attn_output_45 = transpose_48.contiguous()
+        transpose_48 = None
+        reshape_35 = attn_output_45.reshape(1, 2, -1)
+        attn_output_45 = None
+        attn_output_46 = reshape_35.contiguous()
+        reshape_35 = None
+        attn_output_47 = torch._C._nn.linear(
+            attn_output_46,
+            l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_46 = l_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_115 = hidden_states_109 + attn_output_47
+        hidden_states_109 = attn_output_47 = None
+        hidden_states_116 = hidden_states_115.to(torch.float32)
+        pow_24 = hidden_states_116.pow(2)
+        variance_23 = pow_24.mean(-1, keepdim=True)
+        pow_24 = None
+        add_64 = variance_23 + 1e-06
+        variance_23 = None
+        rsqrt_23 = torch.rsqrt(add_64)
+        add_64 = None
+        hidden_states_117 = hidden_states_116 * rsqrt_23
+        hidden_states_116 = rsqrt_23 = None
+        to_51 = hidden_states_117.to(torch.bfloat16)
+        hidden_states_117 = None
+        hidden_states_118 = (
+            l_self_modules_model_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_
+            * to_51
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = (
+            to_51
+        ) = None
+        linear_81 = torch._C._nn.linear(
+            hidden_states_118,
+            l_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_11 = torch.nn.functional.silu(linear_81, inplace=False)
+        linear_81 = None
+        linear_82 = torch._C._nn.linear(
+            hidden_states_118,
+            l_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_118 = l_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_98 = silu_11 * linear_82
+        silu_11 = linear_82 = None
+        down_proj_11 = torch._C._nn.linear(
+            mul_98,
+            l_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_98 = l_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_119 = hidden_states_115 + down_proj_11
+        hidden_states_115 = down_proj_11 = None
+        # decoder layer 12: pre-norm attention + SwiGLU MLP (rotary embedding applied to q/k)
+        hidden_states_120 = hidden_states_119.to(torch.float32)
+        pow_25 = hidden_states_120.pow(2)
+        variance_24 = pow_25.mean(-1, keepdim=True)
+        pow_25 = None
+        add_66 = variance_24 + 1e-06
+        variance_24 = None
+        rsqrt_24 = torch.rsqrt(add_66)
+        add_66 = None
+        hidden_states_121 = hidden_states_120 * rsqrt_24
+        hidden_states_120 = rsqrt_24 = None
+        to_53 = hidden_states_121.to(torch.bfloat16)
+        hidden_states_121 = None
+        hidden_states_122 = (
+            l_self_modules_model_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_
+            * to_53
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = (
+            to_53
+        ) = None
+        linear_84 = torch._C._nn.linear(
+            hidden_states_122,
+            l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_37 = linear_84.view((1, 2, -1, 128))
+        linear_84 = None
+        query_states_12 = view_37.transpose(1, 2)
+        view_37 = None
+        linear_85 = torch._C._nn.linear(
+            hidden_states_122,
+            l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_38 = linear_85.view((1, 2, -1, 128))
+        linear_85 = None
+        key_states_12 = view_38.transpose(1, 2)
+        view_38 = None
+        linear_86 = torch._C._nn.linear(
+            hidden_states_122,
+            l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_122 = l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_39 = linear_86.view((1, 2, -1, 128))
+        linear_86 = None
+        value_states_12 = view_39.transpose(1, 2)
+        view_39 = None
+        cos_12 = cos_2.unsqueeze(1)
+        sin_12 = sin_2.unsqueeze(1)
+        mul_101 = query_states_12 * cos_12
+        x1_18 = query_states_12[(Ellipsis, slice(None, 64, None))]
+        x2_18 = query_states_12[(Ellipsis, slice(64, None, None))]
+        query_states_12 = None
+        neg_18 = -x2_18
+        x2_18 = None
+        cat_19 = torch.cat((neg_18, x1_18), dim=-1)
+        neg_18 = x1_18 = None
+        mul_102 = cat_19 * sin_12
+        cat_19 = None
+        q_embed_9 = mul_101 + mul_102
+        mul_101 = mul_102 = None
+        mul_103 = key_states_12 * cos_12
+        cos_12 = None
+        x1_19 = key_states_12[(Ellipsis, slice(None, 64, None))]
+        x2_19 = key_states_12[(Ellipsis, slice(64, None, None))]
+        key_states_12 = None
+        neg_19 = -x2_19
+        x2_19 = None
+        cat_20 = torch.cat((neg_19, x1_19), dim=-1)
+        neg_19 = x1_19 = None
+        mul_104 = cat_20 * sin_12
+        cat_20 = sin_12 = None
+        k_embed_9 = mul_103 + mul_104
+        mul_103 = mul_104 = None
+        getitem_81 = k_embed_9[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        k_embed_9 = None
+        hidden_states_123 = getitem_81.expand(1, 4, 4, 2, 128)
+        getitem_81 = None
+        key_24 = hidden_states_123.reshape(1, 16, 2, 128)
+        hidden_states_123 = None
+        getitem_82 = value_states_12[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_12 = None
+        hidden_states_124 = getitem_82.expand(1, 4, 4, 2, 128)
+        getitem_82 = None
+        value_24 = hidden_states_124.reshape(1, 16, 2, 128)
+        hidden_states_124 = None
+        attention_mask_13 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_12 = q_embed_9.contiguous()
+        q_embed_9 = None
+        key_25 = key_24.contiguous()
+        key_24 = None
+        value_25 = value_24.contiguous()
+        value_24 = None
+        attn_output_48 = torch._C._nn.scaled_dot_product_attention(
+            query_12,
+            key_25,
+            value_25,
+            attn_mask=attention_mask_13,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_12 = key_25 = value_25 = attention_mask_13 = None
+        transpose_52 = attn_output_48.transpose(1, 2)
+        attn_output_48 = None
+        attn_output_49 = transpose_52.contiguous()
+        transpose_52 = None
+        reshape_38 = attn_output_49.reshape(1, 2, -1)
+        attn_output_49 = None
+        attn_output_50 = reshape_38.contiguous()
+        reshape_38 = None
+        attn_output_51 = torch._C._nn.linear(
+            attn_output_50,
+            l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_50 = l_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_125 = hidden_states_119 + attn_output_51
+        hidden_states_119 = attn_output_51 = None
+        hidden_states_126 = hidden_states_125.to(torch.float32)
+        pow_26 = hidden_states_126.pow(2)
+        variance_25 = pow_26.mean(-1, keepdim=True)
+        pow_26 = None
+        add_70 = variance_25 + 1e-06
+        variance_25 = None
+        rsqrt_25 = torch.rsqrt(add_70)
+        add_70 = None
+        hidden_states_127 = hidden_states_126 * rsqrt_25
+        hidden_states_126 = rsqrt_25 = None
+        to_55 = hidden_states_127.to(torch.bfloat16)
+        hidden_states_127 = None
+        hidden_states_128 = (
+            l_self_modules_model_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_
+            * to_55
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = (
+            to_55
+        ) = None
+        linear_88 = torch._C._nn.linear(
+            hidden_states_128,
+            l_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_12 = torch.nn.functional.silu(linear_88, inplace=False)
+        linear_88 = None
+        linear_89 = torch._C._nn.linear(
+            hidden_states_128,
+            l_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_128 = l_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_107 = silu_12 * linear_89
+        silu_12 = linear_89 = None
+        down_proj_12 = torch._C._nn.linear(
+            mul_107,
+            l_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_107 = l_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_129 = hidden_states_125 + down_proj_12
+        hidden_states_125 = down_proj_12 = None
+        # decoder layer 13: pre-norm attention + SwiGLU MLP (rotary embedding applied to q/k)
+        hidden_states_130 = hidden_states_129.to(torch.float32)
+        pow_27 = hidden_states_130.pow(2)
+        variance_26 = pow_27.mean(-1, keepdim=True)
+        pow_27 = None
+        add_72 = variance_26 + 1e-06
+        variance_26 = None
+        rsqrt_26 = torch.rsqrt(add_72)
+        add_72 = None
+        hidden_states_131 = hidden_states_130 * rsqrt_26
+        hidden_states_130 = rsqrt_26 = None
+        to_57 = hidden_states_131.to(torch.bfloat16)
+        hidden_states_131 = None
+        hidden_states_132 = (
+            l_self_modules_model_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_
+            * to_57
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = (
+            to_57
+        ) = None
+        linear_91 = torch._C._nn.linear(
+            hidden_states_132,
+            l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_40 = linear_91.view((1, 2, -1, 128))
+        linear_91 = None
+        query_states_13 = view_40.transpose(1, 2)
+        view_40 = None
+        linear_92 = torch._C._nn.linear(
+            hidden_states_132,
+            l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_41 = linear_92.view((1, 2, -1, 128))
+        linear_92 = None
+        key_states_13 = view_41.transpose(1, 2)
+        view_41 = None
+        linear_93 = torch._C._nn.linear(
+            hidden_states_132,
+            l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_132 = l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_42 = linear_93.view((1, 2, -1, 128))
+        linear_93 = None
+        value_states_13 = view_42.transpose(1, 2)
+        view_42 = None
+        cos_13 = cos_2.unsqueeze(1)
+        sin_13 = sin_2.unsqueeze(1)
+        mul_110 = query_states_13 * cos_13
+        x1_20 = query_states_13[(Ellipsis, slice(None, 64, None))]
+        x2_20 = query_states_13[(Ellipsis, slice(64, None, None))]
+        query_states_13 = None
+        neg_20 = -x2_20
+        x2_20 = None
+        cat_21 = torch.cat((neg_20, x1_20), dim=-1)
+        neg_20 = x1_20 = None
+        mul_111 = cat_21 * sin_13
+        cat_21 = None
+        q_embed_10 = mul_110 + mul_111
+        mul_110 = mul_111 = None
+        mul_112 = key_states_13 * cos_13
+        cos_13 = None
+        x1_21 = key_states_13[(Ellipsis, slice(None, 64, None))]
+        x2_21 = key_states_13[(Ellipsis, slice(64, None, None))]
+        key_states_13 = None
+        neg_21 = -x2_21
+        x2_21 = None
+        cat_22 = torch.cat((neg_21, x1_21), dim=-1)
+        neg_21 = x1_21 = None
+        mul_113 = cat_22 * sin_13
+        cat_22 = sin_13 = None
+        k_embed_10 = mul_112 + mul_113
+        mul_112 = mul_113 = None
+        getitem_88 = k_embed_10[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        k_embed_10 = None
+        hidden_states_133 = getitem_88.expand(1, 4, 4, 2, 128)
+        getitem_88 = None
+        key_26 = hidden_states_133.reshape(1, 16, 2, 128)
+        hidden_states_133 = None
+        getitem_89 = value_states_13[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_13 = None
+        hidden_states_134 = getitem_89.expand(1, 4, 4, 2, 128)
+        getitem_89 = None
+        value_26 = hidden_states_134.reshape(1, 16, 2, 128)
+        hidden_states_134 = None
+        attention_mask_14 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_13 = q_embed_10.contiguous()
+        q_embed_10 = None
+        key_27 = key_26.contiguous()
+        key_26 = None
+        value_27 = value_26.contiguous()
+        value_26 = None
+        attn_output_52 = torch._C._nn.scaled_dot_product_attention(
+            query_13,
+            key_27,
+            value_27,
+            attn_mask=attention_mask_14,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_13 = key_27 = value_27 = attention_mask_14 = None
+        transpose_56 = attn_output_52.transpose(1, 2)
+        attn_output_52 = None
+        attn_output_53 = transpose_56.contiguous()
+        transpose_56 = None
+        reshape_41 = attn_output_53.reshape(1, 2, -1)
+        attn_output_53 = None
+        attn_output_54 = reshape_41.contiguous()
+        reshape_41 = None
+        attn_output_55 = torch._C._nn.linear(
+            attn_output_54,
+            l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_54 = l_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_135 = hidden_states_129 + attn_output_55
+        hidden_states_129 = attn_output_55 = None
+        hidden_states_136 = hidden_states_135.to(torch.float32)
+        pow_28 = hidden_states_136.pow(2)
+        variance_27 = pow_28.mean(-1, keepdim=True)
+        pow_28 = None
+        add_76 = variance_27 + 1e-06
+        variance_27 = None
+        rsqrt_27 = torch.rsqrt(add_76)
+        add_76 = None
+        hidden_states_137 = hidden_states_136 * rsqrt_27
+        hidden_states_136 = rsqrt_27 = None
+        to_59 = hidden_states_137.to(torch.bfloat16)
+        hidden_states_137 = None
+        hidden_states_138 = (
+            l_self_modules_model_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_
+            * to_59
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = (
+            to_59
+        ) = None
+        linear_95 = torch._C._nn.linear(
+            hidden_states_138,
+            l_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_13 = torch.nn.functional.silu(linear_95, inplace=False)
+        linear_95 = None
+        linear_96 = torch._C._nn.linear(
+            hidden_states_138,
+            l_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_138 = l_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_116 = silu_13 * linear_96
+        silu_13 = linear_96 = None
+        down_proj_13 = torch._C._nn.linear(
+            mul_116,
+            l_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_116 = l_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_139 = hidden_states_135 + down_proj_13
+        hidden_states_135 = down_proj_13 = None
+        # decoder layer 14: pre-norm attention + SwiGLU MLP (rotary embedding applied to q/k)
+        hidden_states_140 = hidden_states_139.to(torch.float32)
+        pow_29 = hidden_states_140.pow(2)
+        variance_28 = pow_29.mean(-1, keepdim=True)
+        pow_29 = None
+        add_78 = variance_28 + 1e-06
+        variance_28 = None
+        rsqrt_28 = torch.rsqrt(add_78)
+        add_78 = None
+        hidden_states_141 = hidden_states_140 * rsqrt_28
+        hidden_states_140 = rsqrt_28 = None
+        to_61 = hidden_states_141.to(torch.bfloat16)
+        hidden_states_141 = None
+        hidden_states_142 = (
+            l_self_modules_model_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_
+            * to_61
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = (
+            to_61
+        ) = None
+        linear_98 = torch._C._nn.linear(
+            hidden_states_142,
+            l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_43 = linear_98.view((1, 2, -1, 128))
+        linear_98 = None
+        query_states_14 = view_43.transpose(1, 2)
+        view_43 = None
+        linear_99 = torch._C._nn.linear(
+            hidden_states_142,
+            l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_44 = linear_99.view((1, 2, -1, 128))
+        linear_99 = None
+        key_states_14 = view_44.transpose(1, 2)
+        view_44 = None
+        linear_100 = torch._C._nn.linear(
+            hidden_states_142,
+            l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_142 = l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_45 = linear_100.view((1, 2, -1, 128))
+        linear_100 = None
+        value_states_14 = view_45.transpose(1, 2)
+        view_45 = None
+        cos_14 = cos_2.unsqueeze(1)
+        sin_14 = sin_2.unsqueeze(1)
+        mul_119 = query_states_14 * cos_14
+        x1_22 = query_states_14[(Ellipsis, slice(None, 64, None))]
+        x2_22 = query_states_14[(Ellipsis, slice(64, None, None))]
+        query_states_14 = None
+        neg_22 = -x2_22
+        x2_22 = None
+        cat_23 = torch.cat((neg_22, x1_22), dim=-1)
+        neg_22 = x1_22 = None
+        mul_120 = cat_23 * sin_14
+        cat_23 = None
+        q_embed_11 = mul_119 + mul_120
+        mul_119 = mul_120 = None
+        mul_121 = key_states_14 * cos_14
+        cos_14 = None
+        x1_23 = key_states_14[(Ellipsis, slice(None, 64, None))]
+        x2_23 = key_states_14[(Ellipsis, slice(64, None, None))]
+        key_states_14 = None
+        neg_23 = -x2_23
+        x2_23 = None
+        cat_24 = torch.cat((neg_23, x1_23), dim=-1)
+        neg_23 = x1_23 = None
+        mul_122 = cat_24 * sin_14
+        cat_24 = sin_14 = None
+        k_embed_11 = mul_121 + mul_122
+        mul_121 = mul_122 = None
+        getitem_95 = k_embed_11[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        k_embed_11 = None
+        hidden_states_143 = getitem_95.expand(1, 4, 4, 2, 128)
+        getitem_95 = None
+        key_28 = hidden_states_143.reshape(1, 16, 2, 128)
+        hidden_states_143 = None
+        getitem_96 = value_states_14[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_14 = None
+        hidden_states_144 = getitem_96.expand(1, 4, 4, 2, 128)
+        getitem_96 = None
+        value_28 = hidden_states_144.reshape(1, 16, 2, 128)
+        hidden_states_144 = None
+        attention_mask_15 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_14 = q_embed_11.contiguous()
+        q_embed_11 = None
+        key_29 = key_28.contiguous()
+        key_28 = None
+        value_29 = value_28.contiguous()
+        value_28 = None
+        attn_output_56 = torch._C._nn.scaled_dot_product_attention(
+            query_14,
+            key_29,
+            value_29,
+            attn_mask=attention_mask_15,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_14 = key_29 = value_29 = attention_mask_15 = None
+        transpose_60 = attn_output_56.transpose(1, 2)
+        attn_output_56 = None
+        attn_output_57 = transpose_60.contiguous()
+        transpose_60 = None
+        reshape_44 = attn_output_57.reshape(1, 2, -1)
+        attn_output_57 = None
+        attn_output_58 = reshape_44.contiguous()
+        reshape_44 = None
+        attn_output_59 = torch._C._nn.linear(
+            attn_output_58,
+            l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_58 = l_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_145 = hidden_states_139 + attn_output_59
+        hidden_states_139 = attn_output_59 = None
+        hidden_states_146 = hidden_states_145.to(torch.float32)
+        pow_30 = hidden_states_146.pow(2)
+        variance_29 = pow_30.mean(-1, keepdim=True)
+        pow_30 = None
+        add_82 = variance_29 + 1e-06
+        variance_29 = None
+        rsqrt_29 = torch.rsqrt(add_82)
+        add_82 = None
+        hidden_states_147 = hidden_states_146 * rsqrt_29
+        hidden_states_146 = rsqrt_29 = None
+        to_63 = hidden_states_147.to(torch.bfloat16)
+        hidden_states_147 = None
+        hidden_states_148 = (
+            l_self_modules_model_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_
+            * to_63
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = (
+            to_63
+        ) = None
+        linear_102 = torch._C._nn.linear(
+            hidden_states_148,
+            l_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_14 = torch.nn.functional.silu(linear_102, inplace=False)
+        linear_102 = None
+        linear_103 = torch._C._nn.linear(
+            hidden_states_148,
+            l_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_148 = l_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_125 = silu_14 * linear_103
+        silu_14 = linear_103 = None
+        down_proj_14 = torch._C._nn.linear(
+            mul_125,
+            l_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_125 = l_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_149 = hidden_states_145 + down_proj_14
+        hidden_states_145 = down_proj_14 = None
+        # decoder layer 15: pre-norm attention + SwiGLU MLP (no rotary embedding on this layer)
+        hidden_states_150 = hidden_states_149.to(torch.float32)
+        pow_31 = hidden_states_150.pow(2)
+        variance_30 = pow_31.mean(-1, keepdim=True)
+        pow_31 = None
+        add_84 = variance_30 + 1e-06
+        variance_30 = None
+        rsqrt_30 = torch.rsqrt(add_84)
+        add_84 = None
+        hidden_states_151 = hidden_states_150 * rsqrt_30
+        hidden_states_150 = rsqrt_30 = None
+        to_65 = hidden_states_151.to(torch.bfloat16)
+        hidden_states_151 = None
+        hidden_states_152 = (
+            l_self_modules_model_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_
+            * to_65
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = (
+            to_65
+        ) = None
+        linear_105 = torch._C._nn.linear(
+            hidden_states_152,
+            l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_46 = linear_105.view((1, 2, -1, 128))
+        linear_105 = None
+        query_states_15 = view_46.transpose(1, 2)
+        view_46 = None
+        linear_106 = torch._C._nn.linear(
+            hidden_states_152,
+            l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_47 = linear_106.view((1, 2, -1, 128))
+        linear_106 = None
+        key_states_15 = view_47.transpose(1, 2)
+        view_47 = None
+        linear_107 = torch._C._nn.linear(
+            hidden_states_152,
+            l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_152 = l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_48 = linear_107.view((1, 2, -1, 128))
+        linear_107 = None
+        value_states_15 = view_48.transpose(1, 2)
+        view_48 = None
+        getitem_98 = key_states_15[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        key_states_15 = None
+        hidden_states_153 = getitem_98.expand(1, 4, 4, 2, 128)
+        getitem_98 = None
+        key_30 = hidden_states_153.reshape(1, 16, 2, 128)
+        hidden_states_153 = None
+        getitem_99 = value_states_15[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_15 = None
+        hidden_states_154 = getitem_99.expand(1, 4, 4, 2, 128)
+        getitem_99 = None
+        value_30 = hidden_states_154.reshape(1, 16, 2, 128)
+        hidden_states_154 = None
+        attention_mask_16 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_15 = query_states_15.contiguous()
+        query_states_15 = None
+        key_31 = key_30.contiguous()
+        key_30 = None
+        value_31 = value_30.contiguous()
+        value_30 = None
+        attn_output_60 = torch._C._nn.scaled_dot_product_attention(
+            query_15,
+            key_31,
+            value_31,
+            attn_mask=attention_mask_16,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_15 = key_31 = value_31 = attention_mask_16 = None
+        transpose_64 = attn_output_60.transpose(1, 2)
+        attn_output_60 = None
+        attn_output_61 = transpose_64.contiguous()
+        transpose_64 = None
+        reshape_47 = attn_output_61.reshape(1, 2, -1)
+        attn_output_61 = None
+        attn_output_62 = reshape_47.contiguous()
+        reshape_47 = None
+        attn_output_63 = torch._C._nn.linear(
+            attn_output_62,
+            l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_62 = l_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_155 = hidden_states_149 + attn_output_63
+        hidden_states_149 = attn_output_63 = None
+        hidden_states_156 = hidden_states_155.to(torch.float32)
+        pow_32 = hidden_states_156.pow(2)
+        variance_31 = pow_32.mean(-1, keepdim=True)
+        pow_32 = None
+        add_86 = variance_31 + 1e-06
+        variance_31 = None
+        rsqrt_31 = torch.rsqrt(add_86)
+        add_86 = None
+        hidden_states_157 = hidden_states_156 * rsqrt_31
+        hidden_states_156 = rsqrt_31 = None
+        to_67 = hidden_states_157.to(torch.bfloat16)
+        hidden_states_157 = None
+        hidden_states_158 = (
+            l_self_modules_model_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_
+            * to_67
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = (
+            to_67
+        ) = None
+        linear_109 = torch._C._nn.linear(
+            hidden_states_158,
+            l_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_15 = torch.nn.functional.silu(linear_109, inplace=False)
+        linear_109 = None
+        linear_110 = torch._C._nn.linear(
+            hidden_states_158,
+            l_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_158 = l_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_130 = silu_15 * linear_110
+        silu_15 = linear_110 = None
+        down_proj_15 = torch._C._nn.linear(
+            mul_130,
+            l_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_130 = l_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_159 = hidden_states_155 + down_proj_15
+        hidden_states_155 = down_proj_15 = None
+        # decoder layer 16: pre-norm attention + SwiGLU MLP (rotary embedding applied to q/k)
+        hidden_states_160 = hidden_states_159.to(torch.float32)
+        pow_33 = hidden_states_160.pow(2)
+        variance_32 = pow_33.mean(-1, keepdim=True)
+        pow_33 = None
+        add_88 = variance_32 + 1e-06
+        variance_32 = None
+        rsqrt_32 = torch.rsqrt(add_88)
+        add_88 = None
+        hidden_states_161 = hidden_states_160 * rsqrt_32
+        hidden_states_160 = rsqrt_32 = None
+        to_69 = hidden_states_161.to(torch.bfloat16)
+        hidden_states_161 = None
+        hidden_states_162 = (
+            l_self_modules_model_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_
+            * to_69
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = (
+            to_69
+        ) = None
+        linear_112 = torch._C._nn.linear(
+            hidden_states_162,
+            l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_49 = linear_112.view((1, 2, -1, 128))
+        linear_112 = None
+        query_states_16 = view_49.transpose(1, 2)
+        view_49 = None
+        linear_113 = torch._C._nn.linear(
+            hidden_states_162,
+            l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_50 = linear_113.view((1, 2, -1, 128))
+        linear_113 = None
+        key_states_16 = view_50.transpose(1, 2)
+        view_50 = None
+        linear_114 = torch._C._nn.linear(
+            hidden_states_162,
+            l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_162 = l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_51 = linear_114.view((1, 2, -1, 128))
+        linear_114 = None
+        value_states_16 = view_51.transpose(1, 2)
+        view_51 = None
+        cos_15 = cos_2.unsqueeze(1)
+        sin_15 = sin_2.unsqueeze(1)
+        mul_133 = query_states_16 * cos_15
+        x1_24 = query_states_16[(Ellipsis, slice(None, 64, None))]
+        x2_24 = query_states_16[(Ellipsis, slice(64, None, None))]
+        query_states_16 = None
+        neg_24 = -x2_24
+        x2_24 = None
+        cat_25 = torch.cat((neg_24, x1_24), dim=-1)
+        neg_24 = x1_24 = None
+        mul_134 = cat_25 * sin_15
+        cat_25 = None
+        q_embed_12 = mul_133 + mul_134
+        mul_133 = mul_134 = None
+        mul_135 = key_states_16 * cos_15
+        cos_15 = None
+        x1_25 = key_states_16[(Ellipsis, slice(None, 64, None))]
+        x2_25 = key_states_16[(Ellipsis, slice(64, None, None))]
+        key_states_16 = None
+        neg_25 = -x2_25
+        x2_25 = None
+        cat_26 = torch.cat((neg_25, x1_25), dim=-1)
+        neg_25 = x1_25 = None
+        mul_136 = cat_26 * sin_15
+        cat_26 = sin_15 = None
+        k_embed_12 = mul_135 + mul_136
+        mul_135 = mul_136 = None
+        getitem_105 = k_embed_12[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        k_embed_12 = None
+        hidden_states_163 = getitem_105.expand(1, 4, 4, 2, 128)
+        getitem_105 = None
+        key_32 = hidden_states_163.reshape(1, 16, 2, 128)
+        hidden_states_163 = None
+        getitem_106 = value_states_16[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states_16 = None
+        hidden_states_164 = getitem_106.expand(1, 4, 4, 2, 128)
+        getitem_106 = None
+        value_32 = hidden_states_164.reshape(1, 16, 2, 128)
+        hidden_states_164 = None
+        attention_mask_17 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 2, None),
+            )
+        ]
+        query_16 = q_embed_12.contiguous()
+        q_embed_12 = None
+        key_33 = key_32.contiguous()
+        key_32 = None
+        value_33 = value_32.contiguous()
+        value_32 = None
+        attn_output_64 = torch._C._nn.scaled_dot_product_attention(
+            query_16,
+            key_33,
+            value_33,
+            attn_mask=attention_mask_17,
+            dropout_p=0.0,
+            scale=0.08838834764831845,
+            is_causal=False,
+        )
+        query_16 = key_33 = value_33 = attention_mask_17 = None
+        transpose_68 = attn_output_64.transpose(1, 2)
+        attn_output_64 = None
+        attn_output_65 = transpose_68.contiguous()
+        transpose_68 = None
+        reshape_50 = attn_output_65.reshape(1, 2, -1)
+        attn_output_65 = None
+        attn_output_66 = reshape_50.contiguous()
+        reshape_50 = None
+        attn_output_67 = torch._C._nn.linear(
+            attn_output_66,
+            l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_66 = l_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_165 = hidden_states_159 + attn_output_67
+        hidden_states_159 = attn_output_67 = None
+        hidden_states_166 = hidden_states_165.to(torch.float32)
+        pow_34 = hidden_states_166.pow(2)
+        variance_33 = pow_34.mean(-1, keepdim=True)
+        pow_34 = None
+        add_92 = variance_33 + 1e-06
+        variance_33 = None
+        rsqrt_33 = torch.rsqrt(add_92)
+        add_92 = None
+        hidden_states_167 = hidden_states_166 * rsqrt_33
+        hidden_states_166 = rsqrt_33 = None
+        to_71 = hidden_states_167.to(torch.bfloat16)
+        hidden_states_167 = None
+        hidden_states_168 = (
+            l_self_modules_model_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_
+            * to_71
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = (
+            to_71
+        ) = None
+        linear_116 = torch._C._nn.linear(
+            hidden_states_168,
+            l_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_16 = torch.nn.functional.silu(linear_116, inplace=False)
+        linear_116 = None
+        linear_117 = torch._C._nn.linear(
+            hidden_states_168,
+            l_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_168 = l_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_139 = silu_16 * linear_117
+        silu_16 = linear_117 = None
+        down_proj_16 = torch._C._nn.linear(
+            mul_139,
+            l_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_139 = l_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_169 = hidden_states_165 + down_proj_16
+        hidden_states_165 = down_proj_16 = None
+        # decoder layer 17: pre-norm attention + SwiGLU MLP (rotary embedding applied to q/k)
+        hidden_states_170 = hidden_states_169.to(torch.float32)
+        pow_35 = hidden_states_170.pow(2)
+        variance_34 = pow_35.mean(-1, keepdim=True)
+        pow_35 = None
+        add_94 = variance_34 + 1e-06
+        variance_34 = None
+        rsqrt_34 = torch.rsqrt(add_94)
+        add_94 = None
+        hidden_states_171 = hidden_states_170 * rsqrt_34
+        hidden_states_170 = rsqrt_34 = None
+        to_73 = hidden_states_171.to(torch.bfloat16)
+        hidden_states_171 = None
+        hidden_states_172 = (
+            l_self_modules_model_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_
+            * to_73
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = (
+            to_73
+        ) = None
+        linear_119 = torch._C._nn.linear(
+            hidden_states_172,
+            l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_52 = linear_119.view((1, 2, -1, 128))
+        linear_119 = None
+        query_states_17 = view_52.transpose(1, 2)
+        view_52 = None
+        linear_120 = torch._C._nn.linear(
+            hidden_states_172,
+            l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_53 = linear_120.view((1, 2, -1, 128))
+        linear_120 = None
+        key_states_17 = view_53.transpose(1, 2)
+        view_53 = None
+        linear_121 = torch._C._nn.linear(
+            hidden_states_172,
+cos_16 = cos_2.unsqueeze(1)
+sin_16 = sin_2.unsqueeze(1)
+mul_142 = query_states_17 * cos_16
+x1_26 = query_states_17[(Ellipsis, slice(None, 64, None))]
+x2_26 = query_states_17[(Ellipsis, slice(64, None, None))]
+query_states_17 = None
+neg_26 = -x2_26
+x2_26 = None
+cat_27 = torch.cat((neg_26, x1_26), dim=-1)
+neg_26 = x1_26 = None
+mul_143 = cat_27 * sin_16
+cat_27 = None
+q_embed_13 = mul_142 + mul_143
+mul_142 = mul_143 = None
+mul_144 = key_states_17 * cos_16
+cos_16 = None
+x1_27 = key_states_17[(Ellipsis, slice(None, 64, None))]
+x2_27 = key_states_17[(Ellipsis, slice(64, None, None))]
+key_states_17 = None
+neg_27 = -x2_27
+x2_27 = None
+cat_28 = torch.cat((neg_27, x1_27), dim=-1)
+neg_27 = x1_27 = None
+mul_145 = cat_28 * sin_16
+cat_28 = sin_16 = None
+k_embed_13 = mul_144 + mul_145
+mul_144 = mul_145 = None
+getitem_112 = k_embed_13[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+k_embed_13 = None
+hidden_states_173 = getitem_112.expand(1, 4, 4, 2, 128)
+getitem_112 = None
+key_34 = hidden_states_173.reshape(1, 16, 2, 128)
+hidden_states_173 = None
+getitem_113 = value_states_17[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+value_states_17 = None
+hidden_states_174 = getitem_113.expand(1, 4, 4, 2, 128)
+getitem_113 = None
+value_34 = hidden_states_174.reshape(1, 16, 2, 128)
+hidden_states_174 = None
+attention_mask_18 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+query_17 = q_embed_13.contiguous()
+q_embed_13 = None
+key_35 = key_34.contiguous()
+key_34 = None
+value_35 = value_34.contiguous()
+value_34 = None
+attn_output_68 = torch._C._nn.scaled_dot_product_attention(query_17, key_35, value_35, attn_mask=attention_mask_18, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+query_17 = key_35 = value_35 = attention_mask_18 = None
+transpose_72 = attn_output_68.transpose(1, 2)
+attn_output_68 = None
+attn_output_69 = transpose_72.contiguous()
+transpose_72 = None
+reshape_53 = attn_output_69.reshape(1, 2, -1)
+attn_output_69 = None
+attn_output_70 = reshape_53.contiguous()
+reshape_53 = None
+attn_output_71 = torch._C._nn.linear(attn_output_70, l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, None)
+attn_output_70 = l_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = None
+hidden_states_175 = hidden_states_169 + attn_output_71
+hidden_states_169 = attn_output_71 = None
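
Every pow(2)/mean/rsqrt run in this graph, like the one that follows, is an unrolled RMSNorm with eps 1e-06: the reduction happens in float32 and the result is cast back to bfloat16 before the learned weight is applied. An editorial sketch under those assumptions:

    import torch

    def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-06):
        in_dtype = x.dtype                         # bfloat16 in this trace
        x = x.to(torch.float32)                    # upcast for a stable reduction
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + eps)
        return weight * x.to(in_dtype)             # scale applied after the downcast
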
+hidden_states_176 = hidden_states_175.to(torch.float32)
+pow_36 = hidden_states_176.pow(2)
+variance_35 = pow_36.mean(-1, keepdim=True)
+pow_36 = None
+add_98 = variance_35 + 1e-06
+variance_35 = None
+rsqrt_35 = torch.rsqrt(add_98)
+add_98 = None
+hidden_states_177 = hidden_states_176 * rsqrt_35
+hidden_states_176 = rsqrt_35 = None
+to_75 = hidden_states_177.to(torch.bfloat16)
+hidden_states_177 = None
+hidden_states_178 = l_self_modules_model_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ * to_75
+l_self_modules_model_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = to_75 = None
+linear_123 = torch._C._nn.linear(hidden_states_178, l_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = None
+silu_17 = torch.nn.functional.silu(linear_123, inplace=False)
+linear_123 = None
+linear_124 = torch._C._nn.linear(hidden_states_178, l_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_, None)
+hidden_states_178 = l_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = None
+mul_148 = silu_17 * linear_124
+silu_17 = linear_124 = None
+down_proj_17 = torch._C._nn.linear(mul_148, l_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, None)
+mul_148 = l_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = None
+hidden_states_179 = hidden_states_175 + down_proj_17
+hidden_states_175 = down_proj_17 = None
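
Each MLP block above is the gated-SiLU ("SwiGLU") pattern: silu(gate_proj(x)) * up_proj(x) followed by down_proj, all bias-free (the trailing None in every linear call). An equivalent editorial sketch:

    import torch
    import torch.nn.functional as F

    def swiglu_mlp(x, w_gate, w_up, w_down):
        gate = F.silu(F.linear(x, w_gate))   # bias-free, as in the trace
        return F.linear(gate * F.linear(x, w_up), w_down)
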
+hidden_states_180 = hidden_states_179.to(torch.float32)
+pow_37 = hidden_states_180.pow(2)
+variance_36 = pow_37.mean(-1, keepdim=True)
+pow_37 = None
+add_100 = variance_36 + 1e-06
+variance_36 = None
+rsqrt_36 = torch.rsqrt(add_100)
+add_100 = None
+hidden_states_181 = hidden_states_180 * rsqrt_36
+hidden_states_180 = rsqrt_36 = None
+to_77 = hidden_states_181.to(torch.bfloat16)
+hidden_states_181 = None
+hidden_states_182 = l_self_modules_model_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ * to_77
+l_self_modules_model_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = to_77 = None
+linear_126 = torch._C._nn.linear(hidden_states_182, l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = None
+view_55 = linear_126.view((1, 2, -1, 128))
+linear_126 = None
+query_states_18 = view_55.transpose(1, 2)
+view_55 = None
+linear_127 = torch._C._nn.linear(hidden_states_182, l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = None
+view_56 = linear_127.view((1, 2, -1, 128))
+linear_127 = None
+key_states_18 = view_56.transpose(1, 2)
+view_56 = None
+linear_128 = torch._C._nn.linear(hidden_states_182, l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_, None)
+hidden_states_182 = l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = None
+view_57 = linear_128.view((1, 2, -1, 128))
+linear_128 = None
+value_states_18 = view_57.transpose(1, 2)
+view_57 = None
+cos_17 = cos_2.unsqueeze(1)
+sin_17 = sin_2.unsqueeze(1)
+mul_151 = query_states_18 * cos_17
+x1_28 = query_states_18[(Ellipsis, slice(None, 64, None))]
+x2_28 = query_states_18[(Ellipsis, slice(64, None, None))]
+query_states_18 = None
+neg_28 = -x2_28
+x2_28 = None
+cat_29 = torch.cat((neg_28, x1_28), dim=-1)
+neg_28 = x1_28 = None
+mul_152 = cat_29 * sin_17
+cat_29 = None
+q_embed_14 = mul_151 + mul_152
+mul_151 = mul_152 = None
+mul_153 = key_states_18 * cos_17
+cos_17 = None
+x1_29 = key_states_18[(Ellipsis, slice(None, 64, None))]
+x2_29 = key_states_18[(Ellipsis, slice(64, None, None))]
+key_states_18 = None
+neg_29 = -x2_29
+x2_29 = None
+cat_30 = torch.cat((neg_29, x1_29), dim=-1)
+neg_29 = x1_29 = None
+mul_154 = cat_30 * sin_17
+cat_30 = sin_17 = None
+k_embed_14 = mul_153 + mul_154
+mul_153 = mul_154 = None
+getitem_119 = k_embed_14[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+k_embed_14 = None
+hidden_states_183 = getitem_119.expand(1, 4, 4, 2, 128)
+getitem_119 = None
+key_36 = hidden_states_183.reshape(1, 16, 2, 128)
+hidden_states_183 = None
+getitem_120 = value_states_18[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+value_states_18 = None
+hidden_states_184 = getitem_120.expand(1, 4, 4, 2, 128)
+getitem_120 = None
+value_36 = hidden_states_184.reshape(1, 16, 2, 128)
+hidden_states_184 = None
+attention_mask_19 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+query_18 = q_embed_14.contiguous()
+q_embed_14 = None
+key_37 = key_36.contiguous()
+key_36 = None
+value_37 = value_36.contiguous()
+value_36 = None
+attn_output_72 = torch._C._nn.scaled_dot_product_attention(query_18, key_37, value_37, attn_mask=attention_mask_19, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+query_18 = key_37 = value_37 = attention_mask_19 = None
+transpose_76 = attn_output_72.transpose(1, 2)
+attn_output_72 = None
+attn_output_73 = transpose_76.contiguous()
+transpose_76 = None
+reshape_56 = attn_output_73.reshape(1, 2, -1)
+attn_output_73 = None
+attn_output_74 = reshape_56.contiguous()
+reshape_56 = None
+attn_output_75 = torch._C._nn.linear(attn_output_74, l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_, None)
+attn_output_74 = l_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = None
+hidden_states_185 = hidden_states_179 + attn_output_75
+hidden_states_179 = attn_output_75 = None
+hidden_states_186 = hidden_states_185.to(torch.float32)
+pow_38 = hidden_states_186.pow(2)
+variance_37 = pow_38.mean(-1, keepdim=True)
+pow_38 = None
+add_104 = variance_37 + 1e-06
+variance_37 = None
+rsqrt_37 = torch.rsqrt(add_104)
+add_104 = None
+hidden_states_187 = hidden_states_186 * rsqrt_37
+hidden_states_186 = rsqrt_37 = None
+to_79 = hidden_states_187.to(torch.bfloat16)
+hidden_states_187 = None
+hidden_states_188 = l_self_modules_model_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ * to_79
+l_self_modules_model_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = to_79 = None
+linear_130 = torch._C._nn.linear(hidden_states_188, l_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ = None
+silu_18 = torch.nn.functional.silu(linear_130, inplace=False)
+linear_130 = None
+linear_131 = torch._C._nn.linear(hidden_states_188, l_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_, None)
+hidden_states_188 = l_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ = None
+mul_157 = silu_18 * linear_131
+silu_18 = linear_131 = None
+down_proj_18 = torch._C._nn.linear(mul_157, l_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_, None)
+mul_157 = l_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = None
+hidden_states_189 = hidden_states_185 + down_proj_18
+hidden_states_185 = down_proj_18 = None
+hidden_states_190 = hidden_states_189.to(torch.float32)
+pow_39 = hidden_states_190.pow(2)
+variance_38 = pow_39.mean(-1, keepdim=True)
+pow_39 = None
+add_106 = variance_38 + 1e-06
+variance_38 = None
+rsqrt_38 = torch.rsqrt(add_106)
+add_106 = None
+hidden_states_191 = hidden_states_190 * rsqrt_38
+hidden_states_190 = rsqrt_38 = None
+to_81 = hidden_states_191.to(torch.bfloat16)
+hidden_states_191 = None
+hidden_states_192 = l_self_modules_model_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ * to_81
+l_self_modules_model_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = to_81 = None
+linear_133 = torch._C._nn.linear(hidden_states_192, l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = None
+view_58 = linear_133.view((1, 2, -1, 128))
+linear_133 = None
+query_states_19 = view_58.transpose(1, 2)
+view_58 = None
+linear_134 = torch._C._nn.linear(hidden_states_192, l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = None
+view_59 = linear_134.view((1, 2, -1, 128))
+linear_134 = None
+key_states_19 = view_59.transpose(1, 2)
+view_59 = None
+linear_135 = torch._C._nn.linear(hidden_states_192, l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_, None)
+hidden_states_192 = l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = None
+view_60 = linear_135.view((1, 2, -1, 128))
+linear_135 = None
+value_states_19 = view_60.transpose(1, 2)
+view_60 = None
+getitem_122 = key_states_19[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+key_states_19 = None
+hidden_states_193 = getitem_122.expand(1, 4, 4, 2, 128)
+getitem_122 = None
+key_38 = hidden_states_193.reshape(1, 16, 2, 128)
+hidden_states_193 = None
+getitem_123 = value_states_19[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+value_states_19 = None
+hidden_states_194 = getitem_123.expand(1, 4, 4, 2, 128)
+getitem_123 = None
+value_38 = hidden_states_194.reshape(1, 16, 2, 128)
+hidden_states_194 = None
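
The indexing/expand/reshape triple above replicates each of the 4 KV heads 4 times so they align with the 16 query heads; this is the usual repeat_kv step of grouped-query attention. An editorial sketch with the traced shapes:

    import torch

    def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
        # (batch, kv_heads, seq, head_dim) -> (batch, kv_heads * n_rep, seq, head_dim)
        b, kv_heads, seq, head_dim = x.shape
        x = x[:, :, None, :, :].expand(b, kv_heads, n_rep, seq, head_dim)
        return x.reshape(b, kv_heads * n_rep, seq, head_dim)

    k = torch.randn(1, 4, 2, 128)
    assert repeat_kv(k, 4).shape == (1, 16, 2, 128)
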
+attention_mask_20 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+query_19 = query_states_19.contiguous()
+query_states_19 = None
+key_39 = key_38.contiguous()
+key_38 = None
+value_39 = value_38.contiguous()
+value_38 = None
+attn_output_76 = torch._C._nn.scaled_dot_product_attention(query_19, key_39, value_39, attn_mask=attention_mask_20, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+query_19 = key_39 = value_39 = attention_mask_20 = None
+transpose_80 = attn_output_76.transpose(1, 2)
+attn_output_76 = None
+attn_output_77 = transpose_80.contiguous()
+transpose_80 = None
+reshape_59 = attn_output_77.reshape(1, 2, -1)
+attn_output_77 = None
+attn_output_78 = reshape_59.contiguous()
+reshape_59 = None
+attn_output_79 = torch._C._nn.linear(attn_output_78, l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_, None)
+attn_output_78 = l_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = None
+hidden_states_195 = hidden_states_189 + attn_output_79
+hidden_states_189 = attn_output_79 = None
+hidden_states_196 = hidden_states_195.to(torch.float32)
+pow_40 = hidden_states_196.pow(2)
+variance_39 = pow_40.mean(-1, keepdim=True)
+pow_40 = None
+add_108 = variance_39 + 1e-06
+variance_39 = None
+rsqrt_39 = torch.rsqrt(add_108)
+add_108 = None
+hidden_states_197 = hidden_states_196 * rsqrt_39
+hidden_states_196 = rsqrt_39 = None
+to_83 = hidden_states_197.to(torch.bfloat16)
+hidden_states_197 = None
+hidden_states_198 = l_self_modules_model_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ * to_83
+l_self_modules_model_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = to_83 = None
+linear_137 = torch._C._nn.linear(hidden_states_198, l_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ = None
+silu_19 = torch.nn.functional.silu(linear_137, inplace=False)
+linear_137 = None
+linear_138 = torch._C._nn.linear(hidden_states_198, l_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_, None)
+hidden_states_198 = l_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ = None
+mul_162 = silu_19 * linear_138
+silu_19 = linear_138 = None
+down_proj_19 = torch._C._nn.linear(mul_162, l_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_, None)
+mul_162 = l_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = None
+hidden_states_199 = hidden_states_195 + down_proj_19
+hidden_states_195 = down_proj_19 = None
+hidden_states_200 = hidden_states_199.to(torch.float32)
+pow_41 = hidden_states_200.pow(2)
+variance_40 = pow_41.mean(-1, keepdim=True)
+pow_41 = None
+add_110 = variance_40 + 1e-06
+variance_40 = None
+rsqrt_40 = torch.rsqrt(add_110)
+add_110 = None
+hidden_states_201 = hidden_states_200 * rsqrt_40
+hidden_states_200 = rsqrt_40 = None
+to_85 = hidden_states_201.to(torch.bfloat16)
+hidden_states_201 = None
+hidden_states_202 = l_self_modules_model_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ * to_85
+l_self_modules_model_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = to_85 = None
+linear_140 = torch._C._nn.linear(hidden_states_202, l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = None
+view_61 = linear_140.view((1, 2, -1, 128))
+linear_140 = None
+query_states_20 = view_61.transpose(1, 2)
+view_61 = None
+linear_141 = torch._C._nn.linear(hidden_states_202, l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = None
+view_62 = linear_141.view((1, 2, -1, 128))
+linear_141 = None
+key_states_20 = view_62.transpose(1, 2)
+view_62 = None
+linear_142 = torch._C._nn.linear(hidden_states_202, l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_, None)
+hidden_states_202 = l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = None
+view_63 = linear_142.view((1, 2, -1, 128))
+linear_142 = None
+value_states_20 = view_63.transpose(1, 2)
+view_63 = None
+cos_18 = cos_2.unsqueeze(1)
+sin_18 = sin_2.unsqueeze(1)
+mul_165 = query_states_20 * cos_18
+x1_30 = query_states_20[(Ellipsis, slice(None, 64, None))]
+x2_30 = query_states_20[(Ellipsis, slice(64, None, None))]
+query_states_20 = None
+neg_30 = -x2_30
+x2_30 = None
+cat_31 = torch.cat((neg_30, x1_30), dim=-1)
+neg_30 = x1_30 = None
+mul_166 = cat_31 * sin_18
+cat_31 = None
+q_embed_15 = mul_165 + mul_166
+mul_165 = mul_166 = None
+mul_167 = key_states_20 * cos_18
+cos_18 = None
+x1_31 = key_states_20[(Ellipsis, slice(None, 64, None))]
+x2_31 = key_states_20[(Ellipsis, slice(64, None, None))]
+key_states_20 = None
+neg_31 = -x2_31
+x2_31 = None
+cat_32 = torch.cat((neg_31, x1_31), dim=-1)
+neg_31 = x1_31 = None
+mul_168 = cat_32 * sin_18
+cat_32 = sin_18 = None
+k_embed_15 = mul_167 + mul_168
+mul_167 = mul_168 = None
+getitem_129 = k_embed_15[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+k_embed_15 = None
+hidden_states_203 = getitem_129.expand(1, 4, 4, 2, 128)
+getitem_129 = None
+key_40 = hidden_states_203.reshape(1, 16, 2, 128)
+hidden_states_203 = None
+getitem_130 = value_states_20[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+value_states_20 = None
+hidden_states_204 = getitem_130.expand(1, 4, 4, 2, 128)
+getitem_130 = None
+value_40 = hidden_states_204.reshape(1, 16, 2, 128)
+hidden_states_204 = None
+attention_mask_21 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+query_20 = q_embed_15.contiguous()
+q_embed_15 = None
+key_41 = key_40.contiguous()
+key_40 = None
+value_41 = value_40.contiguous()
+value_40 = None
+attn_output_80 = torch._C._nn.scaled_dot_product_attention(query_20, key_41, value_41, attn_mask=attention_mask_21, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+query_20 = key_41 = value_41 = attention_mask_21 = None
+transpose_84 = attn_output_80.transpose(1, 2)
+attn_output_80 = None
+attn_output_81 = transpose_84.contiguous()
+transpose_84 = None
+reshape_62 = attn_output_81.reshape(1, 2, -1)
+attn_output_81 = None
+attn_output_82 = reshape_62.contiguous()
+reshape_62 = None
+attn_output_83 = torch._C._nn.linear(attn_output_82, l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_, None)
+attn_output_82 = l_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = None
+hidden_states_205 = hidden_states_199 + attn_output_83
+hidden_states_199 = attn_output_83 = None
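
After every SDPA call the graph undoes the head split before the output projection: heads move back next to the sequence axis, then flatten to (batch, seq, heads * head_dim), then o_proj. An editorial sketch, where the 2048-wide flattened input (16 * 128) is pinned by the trace but the square o_proj shape is an assumption for illustration:

    import torch
    import torch.nn.functional as F

    attn = torch.randn(1, 16, 2, 128)   # (batch, heads, seq, head_dim)
    w_o = torch.randn(2048, 2048)       # assumed square; only the 2048 input width is pinned

    merged = attn.transpose(1, 2).contiguous().reshape(1, 2, -1)  # (1, 2, 2048)
    out = F.linear(merged, w_o)         # o_proj, bias-free in the trace
    assert out.shape == (1, 2, 2048)
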
+hidden_states_206 = hidden_states_205.to(torch.float32)
+pow_42 = hidden_states_206.pow(2)
+variance_41 = pow_42.mean(-1, keepdim=True)
+pow_42 = None
+add_114 = variance_41 + 1e-06
+variance_41 = None
+rsqrt_41 = torch.rsqrt(add_114)
+add_114 = None
+hidden_states_207 = hidden_states_206 * rsqrt_41
+hidden_states_206 = rsqrt_41 = None
+to_87 = hidden_states_207.to(torch.bfloat16)
+hidden_states_207 = None
+hidden_states_208 = l_self_modules_model_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ * to_87
+l_self_modules_model_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = to_87 = None
+linear_144 = torch._C._nn.linear(hidden_states_208, l_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ = None
+silu_20 = torch.nn.functional.silu(linear_144, inplace=False)
+linear_144 = None
+linear_145 = torch._C._nn.linear(hidden_states_208, l_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_, None)
+hidden_states_208 = l_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ = None
+mul_171 = silu_20 * linear_145
+silu_20 = linear_145 = None
+down_proj_20 = torch._C._nn.linear(mul_171, l_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_, None)
+mul_171 = l_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = None
+hidden_states_209 = hidden_states_205 + down_proj_20
+hidden_states_205 = down_proj_20 = None
+hidden_states_210 = hidden_states_209.to(torch.float32)
+pow_43 = hidden_states_210.pow(2)
+variance_42 = pow_43.mean(-1, keepdim=True)
+pow_43 = None
+add_116 = variance_42 + 1e-06
+variance_42 = None
+rsqrt_42 = torch.rsqrt(add_116)
+add_116 = None
+hidden_states_211 = hidden_states_210 * rsqrt_42
+hidden_states_210 = rsqrt_42 = None
+to_89 = hidden_states_211.to(torch.bfloat16)
+hidden_states_211 = None
+hidden_states_212 = l_self_modules_model_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ * to_89
+l_self_modules_model_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = to_89 = None
+linear_147 = torch._C._nn.linear(hidden_states_212, l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = None
+view_64 = linear_147.view((1, 2, -1, 128))
+linear_147 = None
+query_states_21 = view_64.transpose(1, 2)
+view_64 = None
+linear_148 = torch._C._nn.linear(hidden_states_212, l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = None
+view_65 = linear_148.view((1, 2, -1, 128))
+linear_148 = None
+key_states_21 = view_65.transpose(1, 2)
+view_65 = None
+linear_149 = torch._C._nn.linear(hidden_states_212, l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_, None)
+hidden_states_212 = l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = None
+view_66 = linear_149.view((1, 2, -1, 128))
+linear_149 = None
+value_states_21 = view_66.transpose(1, 2)
+view_66 = None
+cos_19 = cos_2.unsqueeze(1)
+sin_19 = sin_2.unsqueeze(1)
+mul_174 = query_states_21 * cos_19
+x1_32 = query_states_21[(Ellipsis, slice(None, 64, None))]
+x2_32 = query_states_21[(Ellipsis, slice(64, None, None))]
+query_states_21 = None
+neg_32 = -x2_32
+x2_32 = None
+cat_33 = torch.cat((neg_32, x1_32), dim=-1)
+neg_32 = x1_32 = None
+mul_175 = cat_33 * sin_19
+cat_33 = None
+q_embed_16 = mul_174 + mul_175
+mul_174 = mul_175 = None
+mul_176 = key_states_21 * cos_19
+cos_19 = None
+x1_33 = key_states_21[(Ellipsis, slice(None, 64, None))]
+x2_33 = key_states_21[(Ellipsis, slice(64, None, None))]
+key_states_21 = None
+neg_33 = -x2_33
+x2_33 = None
+cat_34 = torch.cat((neg_33, x1_33), dim=-1)
+neg_33 = x1_33 = None
+mul_177 = cat_34 * sin_19
+cat_34 = sin_19 = None
+k_embed_16 = mul_176 + mul_177
+mul_176 = mul_177 = None
+getitem_136 = k_embed_16[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+k_embed_16 = None
+hidden_states_213 = getitem_136.expand(1, 4, 4, 2, 128)
+getitem_136 = None
+key_42 = hidden_states_213.reshape(1, 16, 2, 128)
+hidden_states_213 = None
+getitem_137 = value_states_21[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+value_states_21 = None
+hidden_states_214 = getitem_137.expand(1, 4, 4, 2, 128)
+getitem_137 = None
+value_42 = hidden_states_214.reshape(1, 16, 2, 128)
+hidden_states_214 = None
+attention_mask_22 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+query_21 = q_embed_16.contiguous()
+q_embed_16 = None
+key_43 = key_42.contiguous()
+key_42 = None
+value_43 = value_42.contiguous()
+value_42 = None
+attn_output_84 = torch._C._nn.scaled_dot_product_attention(query_21, key_43, value_43, attn_mask=attention_mask_22, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+query_21 = key_43 = value_43 = attention_mask_22 = None
+transpose_88 = attn_output_84.transpose(1, 2)
+attn_output_84 = None
+attn_output_85 = transpose_88.contiguous()
+transpose_88 = None
+reshape_65 = attn_output_85.reshape(1, 2, -1)
+attn_output_85 = None
+attn_output_86 = reshape_65.contiguous()
+reshape_65 = None
+attn_output_87 = torch._C._nn.linear(attn_output_86, l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, None)
+attn_output_86 = l_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = None
+hidden_states_215 = hidden_states_209 + attn_output_87
+hidden_states_209 = attn_output_87 = None
+hidden_states_216 = hidden_states_215.to(torch.float32)
+pow_44 = hidden_states_216.pow(2)
+variance_43 = pow_44.mean(-1, keepdim=True)
+pow_44 = None
+add_120 = variance_43 + 1e-06
+variance_43 = None
+rsqrt_43 = torch.rsqrt(add_120)
+add_120 = None
+hidden_states_217 = hidden_states_216 * rsqrt_43
+hidden_states_216 = rsqrt_43 = None
+to_91 = hidden_states_217.to(torch.bfloat16)
+hidden_states_217 = None
+hidden_states_218 = l_self_modules_model_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ * to_91
+l_self_modules_model_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = to_91 = None
+linear_151 = torch._C._nn.linear(hidden_states_218, l_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ = None
+silu_21 = torch.nn.functional.silu(linear_151, inplace=False)
+linear_151 = None
+linear_152 = torch._C._nn.linear(hidden_states_218, l_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_, None)
+hidden_states_218 = l_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ = None
+mul_180 = silu_21 * linear_152
+silu_21 = linear_152 = None
+down_proj_21 = torch._C._nn.linear(mul_180, l_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, None)
+mul_180 = l_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = None
+hidden_states_219 = hidden_states_215 + down_proj_21
+hidden_states_215 = down_proj_21 = None
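
Stitched together, each repetition in this graph is one pre-norm decoder layer: RMSNorm -> attention -> residual, then RMSNorm -> SwiGLU MLP -> residual. An editorial sketch reusing the helper sketches above (rms_norm, apply_rope, repeat_kv and swiglu_mlp are from those sketches, not names in the patch):

    import torch.nn.functional as F

    def decoder_layer(h, p, cos, sin, mask):
        # Attention sub-block with residual.
        x = rms_norm(h, p["input_ln"])
        q = F.linear(x, p["wq"]).view(1, -1, 16, 128).transpose(1, 2)
        k = F.linear(x, p["wk"]).view(1, -1, 4, 128).transpose(1, 2)
        v = F.linear(x, p["wv"]).view(1, -1, 4, 128).transpose(1, 2)
        q, k = apply_rope(q, k, cos, sin)
        k, v = repeat_kv(k, 4), repeat_kv(v, 4)
        a = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, scale=128 ** -0.5)
        h = h + F.linear(a.transpose(1, 2).reshape(1, -1, 2048), p["wo"])
        # MLP sub-block with residual.
        return h + swiglu_mlp(rms_norm(h, p["post_ln"]), p["wg"], p["wu"], p["wd"])
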
+hidden_states_220 = hidden_states_219.to(torch.float32)
+pow_45 = hidden_states_220.pow(2)
+variance_44 = pow_45.mean(-1, keepdim=True)
+pow_45 = None
+add_122 = variance_44 + 1e-06
+variance_44 = None
+rsqrt_44 = torch.rsqrt(add_122)
+add_122 = None
+hidden_states_221 = hidden_states_220 * rsqrt_44
+hidden_states_220 = rsqrt_44 = None
+to_93 = hidden_states_221.to(torch.bfloat16)
+hidden_states_221 = None
+hidden_states_222 = l_self_modules_model_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ * to_93
+l_self_modules_model_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = to_93 = None
+linear_154 = torch._C._nn.linear(hidden_states_222, l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = None
+view_67 = linear_154.view((1, 2, -1, 128))
+linear_154 = None
+query_states_22 = view_67.transpose(1, 2)
+view_67 = None
+linear_155 = torch._C._nn.linear(hidden_states_222, l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = None
+view_68 = linear_155.view((1, 2, -1, 128))
+linear_155 = None
+key_states_22 = view_68.transpose(1, 2)
+view_68 = None
+linear_156 = torch._C._nn.linear(hidden_states_222, l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_, None)
+hidden_states_222 = l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = None
+view_69 = linear_156.view((1, 2, -1, 128))
+linear_156 = None
+value_states_22 = view_69.transpose(1, 2)
+view_69 = None
+cos_20 = cos_2.unsqueeze(1)
+sin_20 = sin_2.unsqueeze(1)
+mul_183 = query_states_22 * cos_20
+x1_34 = query_states_22[(Ellipsis, slice(None, 64, None))]
+x2_34 = query_states_22[(Ellipsis, slice(64, None, None))]
+query_states_22 = None
+neg_34 = -x2_34
+x2_34 = None
+cat_35 = torch.cat((neg_34, x1_34), dim=-1)
+neg_34 = x1_34 = None
+mul_184 = cat_35 * sin_20
+cat_35 = None
+q_embed_17 = mul_183 + mul_184
+mul_183 = mul_184 = None
+mul_185 = key_states_22 * cos_20
+cos_20 = None
+x1_35 = key_states_22[(Ellipsis, slice(None, 64, None))]
+x2_35 = key_states_22[(Ellipsis, slice(64, None, None))]
+key_states_22 = None
+neg_35 = -x2_35
+x2_35 = None
+cat_36 = torch.cat((neg_35, x1_35), dim=-1)
+neg_35 = x1_35 = None
+mul_186 = cat_36 * sin_20
+cat_36 = sin_20 = None
+k_embed_17 = mul_185 + mul_186
+mul_185 = mul_186 = None
+getitem_143 = k_embed_17[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+k_embed_17 = None
+hidden_states_223 = getitem_143.expand(1, 4, 4, 2, 128)
+getitem_143 = None
+key_44 = hidden_states_223.reshape(1, 16, 2, 128)
+hidden_states_223 = None
+getitem_144 = value_states_22[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+value_states_22 = None
+hidden_states_224 = getitem_144.expand(1, 4, 4, 2, 128)
+getitem_144 = None
+value_44 = hidden_states_224.reshape(1, 16, 2, 128)
+hidden_states_224 = None
+attention_mask_23 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+query_22 = q_embed_17.contiguous()
+q_embed_17 = None
+key_45 = key_44.contiguous()
+key_44 = None
+value_45 = value_44.contiguous()
+value_44 = None
+attn_output_88 = torch._C._nn.scaled_dot_product_attention(query_22, key_45, value_45, attn_mask=attention_mask_23, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+query_22 = key_45 = value_45 = attention_mask_23 = None
+transpose_92 = attn_output_88.transpose(1, 2)
+attn_output_88 = None
+attn_output_89 = transpose_92.contiguous()
+transpose_92 = None
+reshape_68 = attn_output_89.reshape(1, 2, -1)
+attn_output_89 = None
+attn_output_90 = reshape_68.contiguous()
+reshape_68 = None
+attn_output_91 = torch._C._nn.linear(attn_output_90, l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_, None)
+attn_output_90 = l_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = None
+hidden_states_225 = hidden_states_219 + attn_output_91
+hidden_states_219 = attn_output_91 = None
+hidden_states_226 = hidden_states_225.to(torch.float32)
+pow_46 = hidden_states_226.pow(2)
+variance_45 = pow_46.mean(-1, keepdim=True)
+pow_46 = None
+add_126 = variance_45 + 1e-06
+variance_45 = None
+rsqrt_45 = torch.rsqrt(add_126)
+add_126 = None
+hidden_states_227 = hidden_states_226 * rsqrt_45
+hidden_states_226 = rsqrt_45 = None
+to_95 = hidden_states_227.to(torch.bfloat16)
+hidden_states_227 = None
+hidden_states_228 = l_self_modules_model_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ * to_95
+l_self_modules_model_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = to_95 = None
+linear_158 = torch._C._nn.linear(hidden_states_228, l_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_ = None
+silu_22 = torch.nn.functional.silu(linear_158, inplace=False)
+linear_158 = None
+linear_159 = torch._C._nn.linear(hidden_states_228, l_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_, None)
+hidden_states_228 = l_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_ = None
+mul_189 = silu_22 * linear_159
+silu_22 = linear_159 = None
+down_proj_22 = torch._C._nn.linear(mul_189, l_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_, None)
+mul_189 = l_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = None
+hidden_states_229 = hidden_states_225 + down_proj_22
+hidden_states_225 = down_proj_22 = None
+hidden_states_230 = hidden_states_229.to(torch.float32)
+pow_47 = hidden_states_230.pow(2)
+variance_46 = pow_47.mean(-1, keepdim=True)
+pow_47 = None
+add_128 = variance_46 + 1e-06
+variance_46 = None
+rsqrt_46 = torch.rsqrt(add_128)
+add_128 = None
+hidden_states_231 = hidden_states_230 * rsqrt_46
+hidden_states_230 = rsqrt_46 = None
+to_97 = hidden_states_231.to(torch.bfloat16)
+hidden_states_231 = None
+hidden_states_232 = l_self_modules_model_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ * to_97
+l_self_modules_model_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = to_97 = None
+linear_161 = torch._C._nn.linear(hidden_states_232, l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = None
+view_70 = linear_161.view((1, 2, -1, 128))
+linear_161 = None
+query_states_23 = view_70.transpose(1, 2)
+view_70 = None
+linear_162 = torch._C._nn.linear(hidden_states_232, l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = None
+view_71 = linear_162.view((1, 2, -1, 128))
+linear_162 = None
+key_states_23 = view_71.transpose(1, 2)
+view_71 = None
+linear_163 = torch._C._nn.linear(hidden_states_232, l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_, None)
+hidden_states_232 = l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = None
+view_72 = linear_163.view((1, 2, -1, 128))
+linear_163 = None
+value_states_23 = view_72.transpose(1, 2)
+view_72 = None
+getitem_146 = key_states_23[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+key_states_23 = None
+hidden_states_233 = getitem_146.expand(1, 4, 4, 2, 128)
+getitem_146 = None
+key_46 = hidden_states_233.reshape(1, 16, 2, 128)
+hidden_states_233 = None
+getitem_147 = value_states_23[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+value_states_23 = None
+hidden_states_234 = getitem_147.expand(1, 4, 4, 2, 128)
+getitem_147 = None
+value_46 = hidden_states_234.reshape(1, 16, 2, 128)
+hidden_states_234 = None
+attention_mask_24 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+query_23 = query_states_23.contiguous()
+query_states_23 = None
+key_47 = key_46.contiguous()
+key_46 = None
+value_47 = value_46.contiguous()
+value_46 = None
+attn_output_92 = torch._C._nn.scaled_dot_product_attention(query_23, key_47, value_47, attn_mask=attention_mask_24, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+query_23 = key_47 = value_47 = attention_mask_24 = None
+transpose_96 = attn_output_92.transpose(1, 2)
+attn_output_92 = None
+attn_output_93 = transpose_96.contiguous()
+transpose_96 = None
+reshape_71 = attn_output_93.reshape(1, 2, -1)
+attn_output_93 = None
+attn_output_94 = reshape_71.contiguous()
+reshape_71 = None
+attn_output_95 = torch._C._nn.linear(attn_output_94, l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_, None)
+attn_output_94 = l_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = None
+hidden_states_235 = hidden_states_229 + attn_output_95
+hidden_states_229 = attn_output_95 = None
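
Note that the attention block just completed (layer 23), like layers 19 above and 27 below, applies no rotary embedding: query_states/key_states flow straight into the contiguous/expand path. If that reflects the model configuration rather than a tracing artifact, the layer loop would gate RoPE per layer; the modulo predicate below is a hypothetical schedule consistent with layers 19, 23 and 27 and is not stated anywhere in this patch:

    def maybe_rope(layer_idx, q, k, cos, sin):
        if layer_idx % 4 == 3:  # hypothetical: matches the RoPE-free layers here
            return q, k
        return apply_rope(q, k, cos, sin)  # apply_rope from the earlier sketch
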
+hidden_states_236 = hidden_states_235.to(torch.float32)
+pow_48 = hidden_states_236.pow(2)
+variance_47 = pow_48.mean(-1, keepdim=True)
+pow_48 = None
+add_130 = variance_47 + 1e-06
+variance_47 = None
+rsqrt_47 = torch.rsqrt(add_130)
+add_130 = None
+hidden_states_237 = hidden_states_236 * rsqrt_47
+hidden_states_236 = rsqrt_47 = None
+to_99 = hidden_states_237.to(torch.bfloat16)
+hidden_states_237 = None
+hidden_states_238 = l_self_modules_model_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ * to_99
+l_self_modules_model_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = to_99 = None
+linear_165 = torch._C._nn.linear(hidden_states_238, l_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_ = None
+silu_23 = torch.nn.functional.silu(linear_165, inplace=False)
+linear_165 = None
+linear_166 = torch._C._nn.linear(hidden_states_238, l_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_, None)
+hidden_states_238 = l_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_ = None
+mul_194 = silu_23 * linear_166
+silu_23 = linear_166 = None
+down_proj_23 = torch._C._nn.linear(mul_194, l_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_, None)
+mul_194 = l_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = None
+hidden_states_239 = hidden_states_235 + down_proj_23
+hidden_states_235 = down_proj_23 = None
+hidden_states_240 = hidden_states_239.to(torch.float32)
+pow_49 = hidden_states_240.pow(2)
+variance_48 = pow_49.mean(-1, keepdim=True)
+pow_49 = None
+add_132 = variance_48 + 1e-06
+variance_48 = None
+rsqrt_48 = torch.rsqrt(add_132)
+add_132 = None
+hidden_states_241 = hidden_states_240 * rsqrt_48
+hidden_states_240 = rsqrt_48 = None
+to_101 = hidden_states_241.to(torch.bfloat16)
+hidden_states_241 = None
+hidden_states_242 = l_self_modules_model_modules_model_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ * to_101
+l_self_modules_model_modules_model_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = to_101 = None
+linear_168 = torch._C._nn.linear(hidden_states_242, l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_ = None
+view_73 = linear_168.view((1, 2, -1, 128))
+linear_168 = None
+query_states_24 = view_73.transpose(1, 2)
+view_73 = None
+linear_169 = torch._C._nn.linear(hidden_states_242, l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_ = None
+view_74 = linear_169.view((1, 2, -1, 128))
+linear_169 = None
+key_states_24 = view_74.transpose(1, 2)
+view_74 = None
+linear_170 = torch._C._nn.linear(hidden_states_242, l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_, None)
+hidden_states_242 = l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_ = None
+view_75 = linear_170.view((1, 2, -1, 128))
+linear_170 = None
+value_states_24 = view_75.transpose(1, 2)
+view_75 = None
+cos_21 = cos_2.unsqueeze(1)
+sin_21 = sin_2.unsqueeze(1)
+mul_197 = query_states_24 * cos_21
+x1_36 = query_states_24[(Ellipsis, slice(None, 64, None))]
+x2_36 = query_states_24[(Ellipsis, slice(64, None, None))]
+query_states_24 = None
+neg_36 = -x2_36
+x2_36 = None
+cat_37 = torch.cat((neg_36, x1_36), dim=-1)
+neg_36 = x1_36 = None
+mul_198 = cat_37 * sin_21
+cat_37 = None
+q_embed_18 = mul_197 + mul_198
+mul_197 = mul_198 = None
+mul_199 = key_states_24 * cos_21
+cos_21 = None
+x1_37 = key_states_24[(Ellipsis, slice(None, 64, None))]
+x2_37 = key_states_24[(Ellipsis, slice(64, None, None))]
+key_states_24 = None
+neg_37 = -x2_37
+x2_37 = None
+cat_38 = torch.cat((neg_37, x1_37), dim=-1)
+neg_37 = x1_37 = None
+mul_200 = cat_38 * sin_21
+cat_38 = sin_21 = None
+k_embed_18 = mul_199 + mul_200
+mul_199 = mul_200 = None
+getitem_153 = k_embed_18[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+k_embed_18 = None
+hidden_states_243 = getitem_153.expand(1, 4, 4, 2, 128)
+getitem_153 = None
+key_48 = hidden_states_243.reshape(1, 16, 2, 128)
+hidden_states_243 = None
+getitem_154 = value_states_24[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+value_states_24 = None
+hidden_states_244 = getitem_154.expand(1, 4, 4, 2, 128)
+getitem_154 = None
+value_48 = hidden_states_244.reshape(1, 16, 2, 128)
+hidden_states_244 = None
+attention_mask_25 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+query_24 = q_embed_18.contiguous()
+q_embed_18 = None
+key_49 = key_48.contiguous()
+key_48 = None
+value_49 = value_48.contiguous()
+value_48 = None
+attn_output_96 = torch._C._nn.scaled_dot_product_attention(query_24, key_49, value_49, attn_mask=attention_mask_25, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+query_24 = key_49 = value_49 = attention_mask_25 = None
+transpose_100 = attn_output_96.transpose(1, 2)
+attn_output_96 = None
+attn_output_97 = transpose_100.contiguous()
+transpose_100 = None
+reshape_74 = attn_output_97.reshape(1, 2, -1)
+attn_output_97 = None
+attn_output_98 = reshape_74.contiguous()
+reshape_74 = None
+attn_output_99 = torch._C._nn.linear(attn_output_98, l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_, None)
+attn_output_98 = l_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = None
+hidden_states_245 = hidden_states_239 + attn_output_99
+hidden_states_239 = attn_output_99 = None
+hidden_states_246 = hidden_states_245.to(torch.float32)
+pow_50 = hidden_states_246.pow(2)
+variance_49 = pow_50.mean(-1, keepdim=True)
+pow_50 = None
+add_136 = variance_49 + 1e-06
+variance_49 = None
+rsqrt_49 = torch.rsqrt(add_136)
+add_136 = None
+hidden_states_247 = hidden_states_246 * rsqrt_49
+hidden_states_246 = rsqrt_49 = None
+to_103 = hidden_states_247.to(torch.bfloat16)
+hidden_states_247 = None
+hidden_states_248 = l_self_modules_model_modules_model_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ * to_103
+l_self_modules_model_modules_model_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = to_103 = None
+linear_172 = torch._C._nn.linear(hidden_states_248, l_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_gate_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_gate_proj_parameters_weight_ = None
+silu_24 = torch.nn.functional.silu(linear_172, inplace=False)
+linear_172 = None
+linear_173 = torch._C._nn.linear(hidden_states_248, l_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_up_proj_parameters_weight_, None)
+hidden_states_248 = l_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_up_proj_parameters_weight_ = None
+mul_203 = silu_24 * linear_173
+silu_24 = linear_173 = None
+down_proj_24 = torch._C._nn.linear(mul_203, l_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_, None)
+mul_203 = l_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = None
+hidden_states_249 = hidden_states_245 + down_proj_24
+hidden_states_245 = down_proj_24 = None
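
Every projection in the graph calls torch._C._nn.linear(input, weight, None) directly; that is the C++ entry point behind torch.nn.functional.linear, computing input @ weight.T with no bias. A quick equivalence check:

    import torch

    x = torch.randn(1, 2, 2048)
    w = torch.randn(1024, 2048)
    assert torch.equal(torch._C._nn.linear(x, w, None),
                       torch.nn.functional.linear(x, w))  # same kernel, public API
    assert torch._C._nn.linear(x, w, None).shape == (1, 2, 1024)  # x @ w.T
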
+hidden_states_250 = hidden_states_249.to(torch.float32)
+pow_51 = hidden_states_250.pow(2)
+variance_50 = pow_51.mean(-1, keepdim=True)
+pow_51 = None
+add_138 = variance_50 + 1e-06
+variance_50 = None
+rsqrt_50 = torch.rsqrt(add_138)
+add_138 = None
+hidden_states_251 = hidden_states_250 * rsqrt_50
+hidden_states_250 = rsqrt_50 = None
+to_105 = hidden_states_251.to(torch.bfloat16)
+hidden_states_251 = None
+hidden_states_252 = l_self_modules_model_modules_model_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ * to_105
+l_self_modules_model_modules_model_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = to_105 = None
+linear_175 = torch._C._nn.linear(hidden_states_252, l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_ = None
+view_76 = linear_175.view((1, 2, -1, 128))
+linear_175 = None
+query_states_25 = view_76.transpose(1, 2)
+view_76 = None
+linear_176 = torch._C._nn.linear(hidden_states_252, l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_ = None
+view_77 = linear_176.view((1, 2, -1, 128))
+linear_176 = None
+key_states_25 = view_77.transpose(1, 2)
+view_77 = None
+linear_177 = torch._C._nn.linear(hidden_states_252, l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_, None)
+hidden_states_252 = l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_ = None
+view_78 = linear_177.view((1, 2, -1, 128))
+linear_177 = None
+value_states_25 = view_78.transpose(1, 2)
+view_78 = None
+cos_22 = cos_2.unsqueeze(1)
+sin_22 = sin_2.unsqueeze(1)
+mul_206 = query_states_25 * cos_22
+x1_38 = query_states_25[(Ellipsis, slice(None, 64, None))]
+x2_38 = query_states_25[(Ellipsis, slice(64, None, None))]
+query_states_25 = None
+neg_38 = -x2_38
+x2_38 = None
+cat_39 = torch.cat((neg_38, x1_38), dim=-1)
+neg_38 = x1_38 = None
+mul_207 = cat_39 * sin_22
+cat_39 = None
+q_embed_19 = mul_206 + mul_207
+mul_206 = mul_207 = None
+mul_208 = key_states_25 * cos_22
+cos_22 = None
+x1_39 = key_states_25[(Ellipsis, slice(None, 64, None))]
+x2_39 = key_states_25[(Ellipsis, slice(64, None, None))]
+key_states_25 = None
+neg_39 = -x2_39
+x2_39 = None
+cat_40 = torch.cat((neg_39, x1_39), dim=-1)
+neg_39 = x1_39 = None
+mul_209 = cat_40 * sin_22
+cat_40 = sin_22 = None
+k_embed_19 = mul_208 + mul_209
+mul_208 = mul_209 = None
+getitem_160 = k_embed_19[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+k_embed_19 = None
+hidden_states_253 = getitem_160.expand(1, 4, 4, 2, 128)
+getitem_160 = None
+key_50 = hidden_states_253.reshape(1, 16, 2, 128)
+hidden_states_253 = None
+getitem_161 = value_states_25[(slice(None, None, None), slice(None, None, None), None, slice(None, None, None), slice(None, None, None))]
+value_states_25 = None
+hidden_states_254 = getitem_161.expand(1, 4, 4, 2, 128)
+getitem_161 = None
+value_50 = hidden_states_254.reshape(1, 16, 2, 128)
+hidden_states_254 = None
+attention_mask_26 = causal_mask_2[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
+query_25 = q_embed_19.contiguous()
+q_embed_19 = None
+key_51 = key_50.contiguous()
+key_50 = None
+value_51 = value_50.contiguous()
+value_50 = None
+attn_output_100 = torch._C._nn.scaled_dot_product_attention(query_25, key_51, value_51, attn_mask=attention_mask_26, dropout_p=0.0, scale=0.08838834764831845, is_causal=False)
+query_25 = key_51 = value_51 = attention_mask_26 = None
+transpose_104 = attn_output_100.transpose(1, 2)
+attn_output_100 = None
+attn_output_101 = transpose_104.contiguous()
+transpose_104 = None
+reshape_77 = attn_output_101.reshape(1, 2, -1)
+attn_output_101 = None
+attn_output_102 = reshape_77.contiguous()
+reshape_77 = None
+attn_output_103 = torch._C._nn.linear(attn_output_102, l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_, None)
+attn_output_102 = l_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = None
+hidden_states_255 = hidden_states_249 + attn_output_103
+hidden_states_249 = attn_output_103 = None
+hidden_states_256 = hidden_states_255.to(torch.float32)
+pow_52 = hidden_states_256.pow(2)
+variance_51 = pow_52.mean(-1, keepdim=True)
+pow_52 = None
+add_142 = variance_51 + 1e-06
+variance_51 = None
+rsqrt_51 = torch.rsqrt(add_142)
+add_142 = None
+hidden_states_257 = hidden_states_256 * rsqrt_51
+hidden_states_256 = rsqrt_51 = None
+to_107 = hidden_states_257.to(torch.bfloat16)
+hidden_states_257 = None
+hidden_states_258 = l_self_modules_model_modules_model_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ * to_107
+l_self_modules_model_modules_model_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = to_107 = None
+linear_179 = torch._C._nn.linear(hidden_states_258, l_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_gate_proj_parameters_weight_, None)
+l_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_gate_proj_parameters_weight_ = None
+silu_25 = torch.nn.functional.silu(linear_179, inplace=False)
+linear_179 = None
+linear_180 = torch._C._nn.linear(hidden_states_258, l_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_up_proj_parameters_weight_, None)
+hidden_states_258 = l_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_up_proj_parameters_weight_ = None
+mul_212 = silu_25 * linear_180
+silu_25 = linear_180 = None
+down_proj_25 = torch._C._nn.linear(mul_212, l_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_, None)
+mul_212 = l_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = None
+hidden_states_259 = hidden_states_255 + down_proj_25
+hidden_states_255 = down_proj_25 = None
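
Each attention_mask_N above is the same precomputed causal_mask_2 cropped to the current key length with the slice(None, 2, None) subscript. For this 2-token step the slice yields a 2x2 additive mask; a sketch assuming the usual min-value convention and an assumed 8-slot cache width (the patch does not state the full mask width):

    import torch

    full = torch.full((1, 1, 2, 8), torch.finfo(torch.float32).min)
    full = torch.triu(full, diagonal=1)  # 0 on/below the diagonal
    attention_mask = full[:, :, :, :2]   # the slice(None, 2, None) in the trace
    # row 0 masks the future token; row 1 sees both:
    # [[0, -inf],
    #  [0,    0]]
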
None + k_embed_20 = mul_217 + mul_218 + mul_217 = mul_218 = None + getitem_167 = k_embed_20[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_20 = None + hidden_states_263 = getitem_167.expand(1, 4, 4, 2, 128) + getitem_167 = None + key_52 = hidden_states_263.reshape(1, 16, 2, 128) + hidden_states_263 = None + getitem_168 = value_states_26[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_26 = None + hidden_states_264 = getitem_168.expand(1, 4, 4, 2, 128) + getitem_168 = None + value_52 = hidden_states_264.reshape(1, 16, 2, 128) + hidden_states_264 = None + attention_mask_27 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_26 = q_embed_20.contiguous() + q_embed_20 = None + key_53 = key_52.contiguous() + key_52 = None + value_53 = value_52.contiguous() + value_52 = None + attn_output_104 = torch._C._nn.scaled_dot_product_attention( + query_26, + key_53, + value_53, + attn_mask=attention_mask_27, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_26 = key_53 = value_53 = attention_mask_27 = None + transpose_108 = attn_output_104.transpose(1, 2) + attn_output_104 = None + attn_output_105 = transpose_108.contiguous() + transpose_108 = None + reshape_80 = attn_output_105.reshape(1, 2, -1) + attn_output_105 = None + attn_output_106 = reshape_80.contiguous() + reshape_80 = None + attn_output_107 = torch._C._nn.linear( + attn_output_106, + l_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_106 = l_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_265 = hidden_states_259 + attn_output_107 + hidden_states_259 = attn_output_107 = None + hidden_states_266 = hidden_states_265.to(torch.float32) + pow_54 = hidden_states_266.pow(2) + variance_53 = pow_54.mean(-1, keepdim=True) + pow_54 = None + add_148 = variance_53 + 1e-06 + variance_53 = None + rsqrt_53 = torch.rsqrt(add_148) + add_148 = None + hidden_states_267 = hidden_states_266 * rsqrt_53 + hidden_states_266 = rsqrt_53 = None + to_111 = hidden_states_267.to(torch.bfloat16) + hidden_states_267 = None + hidden_states_268 = ( + l_self_modules_model_modules_model_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ + * to_111 + ) + l_self_modules_model_modules_model_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = ( + to_111 + ) = None + linear_186 = torch._C._nn.linear( + hidden_states_268, + l_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_26 = torch.nn.functional.silu(linear_186, inplace=False) + linear_186 = None + linear_187 = torch._C._nn.linear( + hidden_states_268, + l_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_268 = l_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_221 = silu_26 * linear_187 + silu_26 = linear_187 = None + down_proj_26 = 
torch._C._nn.linear( + mul_221, + l_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_221 = l_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_269 = hidden_states_265 + down_proj_26 + hidden_states_265 = down_proj_26 = None + hidden_states_270 = hidden_states_269.to(torch.float32) + pow_55 = hidden_states_270.pow(2) + variance_54 = pow_55.mean(-1, keepdim=True) + pow_55 = None + add_150 = variance_54 + 1e-06 + variance_54 = None + rsqrt_54 = torch.rsqrt(add_150) + add_150 = None + hidden_states_271 = hidden_states_270 * rsqrt_54 + hidden_states_270 = rsqrt_54 = None + to_113 = hidden_states_271.to(torch.bfloat16) + hidden_states_271 = None + hidden_states_272 = ( + l_self_modules_model_modules_model_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ + * to_113 + ) + l_self_modules_model_modules_model_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = ( + to_113 + ) = None + linear_189 = torch._C._nn.linear( + hidden_states_272, + l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_82 = linear_189.view((1, 2, -1, 128)) + linear_189 = None + query_states_27 = view_82.transpose(1, 2) + view_82 = None + linear_190 = torch._C._nn.linear( + hidden_states_272, + l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_83 = linear_190.view((1, 2, -1, 128)) + linear_190 = None + key_states_27 = view_83.transpose(1, 2) + view_83 = None + linear_191 = torch._C._nn.linear( + hidden_states_272, + l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_272 = l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_84 = linear_191.view((1, 2, -1, 128)) + linear_191 = None + value_states_27 = view_84.transpose(1, 2) + view_84 = None + getitem_170 = key_states_27[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + key_states_27 = None + hidden_states_273 = getitem_170.expand(1, 4, 4, 2, 128) + getitem_170 = None + key_54 = hidden_states_273.reshape(1, 16, 2, 128) + hidden_states_273 = None + getitem_171 = value_states_27[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_27 = None + hidden_states_274 = getitem_171.expand(1, 4, 4, 2, 128) + getitem_171 = None + value_54 = hidden_states_274.reshape(1, 16, 2, 128) + hidden_states_274 = None + attention_mask_28 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_27 = query_states_27.contiguous() + query_states_27 = None + key_55 = key_54.contiguous() + key_54 = None + value_55 = value_54.contiguous() + value_54 = None + attn_output_108 = torch._C._nn.scaled_dot_product_attention( + query_27, + key_55, + value_55, + attn_mask=attention_mask_28, 
+ dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_27 = key_55 = value_55 = attention_mask_28 = None + transpose_112 = attn_output_108.transpose(1, 2) + attn_output_108 = None + attn_output_109 = transpose_112.contiguous() + transpose_112 = None + reshape_83 = attn_output_109.reshape(1, 2, -1) + attn_output_109 = None + attn_output_110 = reshape_83.contiguous() + reshape_83 = None + attn_output_111 = torch._C._nn.linear( + attn_output_110, + l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_110 = l_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_275 = hidden_states_269 + attn_output_111 + hidden_states_269 = attn_output_111 = None + hidden_states_276 = hidden_states_275.to(torch.float32) + pow_56 = hidden_states_276.pow(2) + variance_55 = pow_56.mean(-1, keepdim=True) + pow_56 = None + add_152 = variance_55 + 1e-06 + variance_55 = None + rsqrt_55 = torch.rsqrt(add_152) + add_152 = None + hidden_states_277 = hidden_states_276 * rsqrt_55 + hidden_states_276 = rsqrt_55 = None + to_115 = hidden_states_277.to(torch.bfloat16) + hidden_states_277 = None + hidden_states_278 = ( + l_self_modules_model_modules_model_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ + * to_115 + ) + l_self_modules_model_modules_model_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = ( + to_115 + ) = None + linear_193 = torch._C._nn.linear( + hidden_states_278, + l_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_27 = torch.nn.functional.silu(linear_193, inplace=False) + linear_193 = None + linear_194 = torch._C._nn.linear( + hidden_states_278, + l_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_278 = l_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_226 = silu_27 * linear_194 + silu_27 = linear_194 = None + down_proj_27 = torch._C._nn.linear( + mul_226, + l_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_226 = l_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_279 = hidden_states_275 + down_proj_27 + hidden_states_275 = down_proj_27 = None + hidden_states_280 = hidden_states_279.to(torch.float32) + pow_57 = hidden_states_280.pow(2) + variance_56 = pow_57.mean(-1, keepdim=True) + pow_57 = None + add_154 = variance_56 + 1e-06 + variance_56 = None + rsqrt_56 = torch.rsqrt(add_154) + add_154 = None + hidden_states_281 = hidden_states_280 * rsqrt_56 + hidden_states_280 = rsqrt_56 = None + to_117 = hidden_states_281.to(torch.bfloat16) + hidden_states_281 = None + hidden_states_282 = ( + l_self_modules_model_modules_model_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ + * to_117 + ) + l_self_modules_model_modules_model_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = ( + to_117 + ) = None + linear_196 = torch._C._nn.linear( + hidden_states_282, + 
l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_85 = linear_196.view((1, 2, -1, 128)) + linear_196 = None + query_states_28 = view_85.transpose(1, 2) + view_85 = None + linear_197 = torch._C._nn.linear( + hidden_states_282, + l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_86 = linear_197.view((1, 2, -1, 128)) + linear_197 = None + key_states_28 = view_86.transpose(1, 2) + view_86 = None + linear_198 = torch._C._nn.linear( + hidden_states_282, + l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_282 = l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_87 = linear_198.view((1, 2, -1, 128)) + linear_198 = None + value_states_28 = view_87.transpose(1, 2) + view_87 = None + cos_24 = cos_2.unsqueeze(1) + sin_24 = sin_2.unsqueeze(1) + mul_229 = query_states_28 * cos_24 + x1_42 = query_states_28[(Ellipsis, slice(None, 64, None))] + x2_42 = query_states_28[(Ellipsis, slice(64, None, None))] + query_states_28 = None + neg_42 = -x2_42 + x2_42 = None + cat_43 = torch.cat((neg_42, x1_42), dim=-1) + neg_42 = x1_42 = None + mul_230 = cat_43 * sin_24 + cat_43 = None + q_embed_21 = mul_229 + mul_230 + mul_229 = mul_230 = None + mul_231 = key_states_28 * cos_24 + cos_24 = None + x1_43 = key_states_28[(Ellipsis, slice(None, 64, None))] + x2_43 = key_states_28[(Ellipsis, slice(64, None, None))] + key_states_28 = None + neg_43 = -x2_43 + x2_43 = None + cat_44 = torch.cat((neg_43, x1_43), dim=-1) + neg_43 = x1_43 = None + mul_232 = cat_44 * sin_24 + cat_44 = sin_24 = None + k_embed_21 = mul_231 + mul_232 + mul_231 = mul_232 = None + getitem_177 = k_embed_21[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_21 = None + hidden_states_283 = getitem_177.expand(1, 4, 4, 2, 128) + getitem_177 = None + key_56 = hidden_states_283.reshape(1, 16, 2, 128) + hidden_states_283 = None + getitem_178 = value_states_28[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_28 = None + hidden_states_284 = getitem_178.expand(1, 4, 4, 2, 128) + getitem_178 = None + value_56 = hidden_states_284.reshape(1, 16, 2, 128) + hidden_states_284 = None + attention_mask_29 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_28 = q_embed_21.contiguous() + q_embed_21 = None + key_57 = key_56.contiguous() + key_56 = None + value_57 = value_56.contiguous() + value_56 = None + attn_output_112 = torch._C._nn.scaled_dot_product_attention( + query_28, + key_57, + value_57, + attn_mask=attention_mask_29, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_28 = key_57 = value_57 = attention_mask_29 = None + transpose_116 = attn_output_112.transpose(1, 2) + attn_output_112 = None + attn_output_113 = transpose_116.contiguous() + transpose_116 = None + reshape_86 = 
attn_output_113.reshape(1, 2, -1) + attn_output_113 = None + attn_output_114 = reshape_86.contiguous() + reshape_86 = None + attn_output_115 = torch._C._nn.linear( + attn_output_114, + l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_114 = l_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_285 = hidden_states_279 + attn_output_115 + hidden_states_279 = attn_output_115 = None + hidden_states_286 = hidden_states_285.to(torch.float32) + pow_58 = hidden_states_286.pow(2) + variance_57 = pow_58.mean(-1, keepdim=True) + pow_58 = None + add_158 = variance_57 + 1e-06 + variance_57 = None + rsqrt_57 = torch.rsqrt(add_158) + add_158 = None + hidden_states_287 = hidden_states_286 * rsqrt_57 + hidden_states_286 = rsqrt_57 = None + to_119 = hidden_states_287.to(torch.bfloat16) + hidden_states_287 = None + hidden_states_288 = ( + l_self_modules_model_modules_model_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ + * to_119 + ) + l_self_modules_model_modules_model_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = ( + to_119 + ) = None + linear_200 = torch._C._nn.linear( + hidden_states_288, + l_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_28 = torch.nn.functional.silu(linear_200, inplace=False) + linear_200 = None + linear_201 = torch._C._nn.linear( + hidden_states_288, + l_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_288 = l_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_235 = silu_28 * linear_201 + silu_28 = linear_201 = None + down_proj_28 = torch._C._nn.linear( + mul_235, + l_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_235 = l_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_289 = hidden_states_285 + down_proj_28 + hidden_states_285 = down_proj_28 = None + hidden_states_290 = hidden_states_289.to(torch.float32) + pow_59 = hidden_states_290.pow(2) + variance_58 = pow_59.mean(-1, keepdim=True) + pow_59 = None + add_160 = variance_58 + 1e-06 + variance_58 = None + rsqrt_58 = torch.rsqrt(add_160) + add_160 = None + hidden_states_291 = hidden_states_290 * rsqrt_58 + hidden_states_290 = rsqrt_58 = None + to_121 = hidden_states_291.to(torch.bfloat16) + hidden_states_291 = None + hidden_states_292 = ( + l_self_modules_model_modules_model_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + * to_121 + ) + l_self_modules_model_modules_model_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = ( + to_121 + ) = None + linear_203 = torch._C._nn.linear( + hidden_states_292, + l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_88 = linear_203.view((1, 2, -1, 128)) + linear_203 = None + query_states_29 = 
view_88.transpose(1, 2) + view_88 = None + linear_204 = torch._C._nn.linear( + hidden_states_292, + l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_89 = linear_204.view((1, 2, -1, 128)) + linear_204 = None + key_states_29 = view_89.transpose(1, 2) + view_89 = None + linear_205 = torch._C._nn.linear( + hidden_states_292, + l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_292 = l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_90 = linear_205.view((1, 2, -1, 128)) + linear_205 = None + value_states_29 = view_90.transpose(1, 2) + view_90 = None + cos_25 = cos_2.unsqueeze(1) + sin_25 = sin_2.unsqueeze(1) + mul_238 = query_states_29 * cos_25 + x1_44 = query_states_29[(Ellipsis, slice(None, 64, None))] + x2_44 = query_states_29[(Ellipsis, slice(64, None, None))] + query_states_29 = None + neg_44 = -x2_44 + x2_44 = None + cat_45 = torch.cat((neg_44, x1_44), dim=-1) + neg_44 = x1_44 = None + mul_239 = cat_45 * sin_25 + cat_45 = None + q_embed_22 = mul_238 + mul_239 + mul_238 = mul_239 = None + mul_240 = key_states_29 * cos_25 + cos_25 = None + x1_45 = key_states_29[(Ellipsis, slice(None, 64, None))] + x2_45 = key_states_29[(Ellipsis, slice(64, None, None))] + key_states_29 = None + neg_45 = -x2_45 + x2_45 = None + cat_46 = torch.cat((neg_45, x1_45), dim=-1) + neg_45 = x1_45 = None + mul_241 = cat_46 * sin_25 + cat_46 = sin_25 = None + k_embed_22 = mul_240 + mul_241 + mul_240 = mul_241 = None + getitem_184 = k_embed_22[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_22 = None + hidden_states_293 = getitem_184.expand(1, 4, 4, 2, 128) + getitem_184 = None + key_58 = hidden_states_293.reshape(1, 16, 2, 128) + hidden_states_293 = None + getitem_185 = value_states_29[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_29 = None + hidden_states_294 = getitem_185.expand(1, 4, 4, 2, 128) + getitem_185 = None + value_58 = hidden_states_294.reshape(1, 16, 2, 128) + hidden_states_294 = None + attention_mask_30 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_29 = q_embed_22.contiguous() + q_embed_22 = None + key_59 = key_58.contiguous() + key_58 = None + value_59 = value_58.contiguous() + value_58 = None + attn_output_116 = torch._C._nn.scaled_dot_product_attention( + query_29, + key_59, + value_59, + attn_mask=attention_mask_30, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_29 = key_59 = value_59 = attention_mask_30 = None + transpose_120 = attn_output_116.transpose(1, 2) + attn_output_116 = None + attn_output_117 = transpose_120.contiguous() + transpose_120 = None + reshape_89 = attn_output_117.reshape(1, 2, -1) + attn_output_117 = None + attn_output_118 = reshape_89.contiguous() + reshape_89 = None + attn_output_119 = torch._C._nn.linear( + attn_output_118, + l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_118 = 
l_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_295 = hidden_states_289 + attn_output_119 + hidden_states_289 = attn_output_119 = None + hidden_states_296 = hidden_states_295.to(torch.float32) + pow_60 = hidden_states_296.pow(2) + variance_59 = pow_60.mean(-1, keepdim=True) + pow_60 = None + add_164 = variance_59 + 1e-06 + variance_59 = None + rsqrt_59 = torch.rsqrt(add_164) + add_164 = None + hidden_states_297 = hidden_states_296 * rsqrt_59 + hidden_states_296 = rsqrt_59 = None + to_123 = hidden_states_297.to(torch.bfloat16) + hidden_states_297 = None + hidden_states_298 = ( + l_self_modules_model_modules_model_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ + * to_123 + ) + l_self_modules_model_modules_model_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = ( + to_123 + ) = None + linear_207 = torch._C._nn.linear( + hidden_states_298, + l_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_29 = torch.nn.functional.silu(linear_207, inplace=False) + linear_207 = None + linear_208 = torch._C._nn.linear( + hidden_states_298, + l_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_298 = l_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_244 = silu_29 * linear_208 + silu_29 = linear_208 = None + down_proj_29 = torch._C._nn.linear( + mul_244, + l_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_244 = l_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_299 = hidden_states_295 + down_proj_29 + hidden_states_295 = down_proj_29 = None + hidden_states_300 = hidden_states_299.to(torch.float32) + pow_61 = hidden_states_300.pow(2) + variance_60 = pow_61.mean(-1, keepdim=True) + pow_61 = None + add_166 = variance_60 + 1e-06 + variance_60 = None + rsqrt_60 = torch.rsqrt(add_166) + add_166 = None + hidden_states_301 = hidden_states_300 * rsqrt_60 + hidden_states_300 = rsqrt_60 = None + to_125 = hidden_states_301.to(torch.bfloat16) + hidden_states_301 = None + hidden_states_302 = ( + l_self_modules_model_modules_model_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + * to_125 + ) + l_self_modules_model_modules_model_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = ( + to_125 + ) = None + linear_210 = torch._C._nn.linear( + hidden_states_302, + l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_91 = linear_210.view((1, 2, -1, 128)) + linear_210 = None + query_states_30 = view_91.transpose(1, 2) + view_91 = None + linear_211 = torch._C._nn.linear( + hidden_states_302, + l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + 
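
The recurring pow(2) / mean(-1, keepdim=True) / rsqrt sequence in the trace above (e.g. pow_60 through rsqrt_60) is the tracer's flattening of RMSNorm: upcast to float32, scale by the reciprocal root mean square, cast back to bfloat16, then multiply by the layernorm weight. A minimal sketch of the equivalent computation, with illustrative names not taken from the patch:

    import torch

    def rms_norm(hidden_states, weight, eps=1e-06):
        # Upcast to float32 so the variance is computed in full precision,
        # exactly as the traced pow/mean/rsqrt sequence does.
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + eps)
        # Cast back to the working dtype (bfloat16 in this trace) before scaling.
        return weight * hidden_states.to(input_dtype)
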
l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_92 = linear_211.view((1, 2, -1, 128)) + linear_211 = None + key_states_30 = view_92.transpose(1, 2) + view_92 = None + linear_212 = torch._C._nn.linear( + hidden_states_302, + l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_302 = l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_93 = linear_212.view((1, 2, -1, 128)) + linear_212 = None + value_states_30 = view_93.transpose(1, 2) + view_93 = None + cos_26 = cos_2.unsqueeze(1) + sin_26 = sin_2.unsqueeze(1) + mul_247 = query_states_30 * cos_26 + x1_46 = query_states_30[(Ellipsis, slice(None, 64, None))] + x2_46 = query_states_30[(Ellipsis, slice(64, None, None))] + query_states_30 = None + neg_46 = -x2_46 + x2_46 = None + cat_47 = torch.cat((neg_46, x1_46), dim=-1) + neg_46 = x1_46 = None + mul_248 = cat_47 * sin_26 + cat_47 = None + q_embed_23 = mul_247 + mul_248 + mul_247 = mul_248 = None + mul_249 = key_states_30 * cos_26 + cos_26 = None + x1_47 = key_states_30[(Ellipsis, slice(None, 64, None))] + x2_47 = key_states_30[(Ellipsis, slice(64, None, None))] + key_states_30 = None + neg_47 = -x2_47 + x2_47 = None + cat_48 = torch.cat((neg_47, x1_47), dim=-1) + neg_47 = x1_47 = None + mul_250 = cat_48 * sin_26 + cat_48 = sin_26 = None + k_embed_23 = mul_249 + mul_250 + mul_249 = mul_250 = None + getitem_191 = k_embed_23[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_23 = None + hidden_states_303 = getitem_191.expand(1, 4, 4, 2, 128) + getitem_191 = None + key_60 = hidden_states_303.reshape(1, 16, 2, 128) + hidden_states_303 = None + getitem_192 = value_states_30[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_30 = None + hidden_states_304 = getitem_192.expand(1, 4, 4, 2, 128) + getitem_192 = None + value_60 = hidden_states_304.reshape(1, 16, 2, 128) + hidden_states_304 = None + attention_mask_31 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_30 = q_embed_23.contiguous() + q_embed_23 = None + key_61 = key_60.contiguous() + key_60 = None + value_61 = value_60.contiguous() + value_60 = None + attn_output_120 = torch._C._nn.scaled_dot_product_attention( + query_30, + key_61, + value_61, + attn_mask=attention_mask_31, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_30 = key_61 = value_61 = attention_mask_31 = None + transpose_124 = attn_output_120.transpose(1, 2) + attn_output_120 = None + attn_output_121 = transpose_124.contiguous() + transpose_124 = None + reshape_92 = attn_output_121.reshape(1, 2, -1) + attn_output_121 = None + attn_output_122 = reshape_92.contiguous() + reshape_92 = None + attn_output_123 = torch._C._nn.linear( + attn_output_122, + l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_122 = l_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_305 = hidden_states_299 + attn_output_123 + hidden_states_299 = attn_output_123 = None + hidden_states_306 = 
hidden_states_305.to(torch.float32) + pow_62 = hidden_states_306.pow(2) + variance_61 = pow_62.mean(-1, keepdim=True) + pow_62 = None + add_170 = variance_61 + 1e-06 + variance_61 = None + rsqrt_61 = torch.rsqrt(add_170) + add_170 = None + hidden_states_307 = hidden_states_306 * rsqrt_61 + hidden_states_306 = rsqrt_61 = None + to_127 = hidden_states_307.to(torch.bfloat16) + hidden_states_307 = None + hidden_states_308 = ( + l_self_modules_model_modules_model_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ + * to_127 + ) + l_self_modules_model_modules_model_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = ( + to_127 + ) = None + linear_214 = torch._C._nn.linear( + hidden_states_308, + l_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_30 = torch.nn.functional.silu(linear_214, inplace=False) + linear_214 = None + linear_215 = torch._C._nn.linear( + hidden_states_308, + l_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_308 = l_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_253 = silu_30 * linear_215 + silu_30 = linear_215 = None + down_proj_30 = torch._C._nn.linear( + mul_253, + l_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_253 = l_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_309 = hidden_states_305 + down_proj_30 + hidden_states_305 = down_proj_30 = None + hidden_states_310 = hidden_states_309.to(torch.float32) + pow_63 = hidden_states_310.pow(2) + variance_62 = pow_63.mean(-1, keepdim=True) + pow_63 = None + add_172 = variance_62 + 1e-06 + variance_62 = None + rsqrt_62 = torch.rsqrt(add_172) + add_172 = None + hidden_states_311 = hidden_states_310 * rsqrt_62 + hidden_states_310 = rsqrt_62 = None + to_129 = hidden_states_311.to(torch.bfloat16) + hidden_states_311 = None + hidden_states_312 = ( + l_self_modules_model_modules_model_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ + * to_129 + ) + l_self_modules_model_modules_model_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = ( + to_129 + ) = None + linear_217 = torch._C._nn.linear( + hidden_states_312, + l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_94 = linear_217.view((1, 2, -1, 128)) + linear_217 = None + query_states_31 = view_94.transpose(1, 2) + view_94 = None + linear_218 = torch._C._nn.linear( + hidden_states_312, + l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_95 = linear_218.view((1, 2, -1, 128)) + linear_218 = None + key_states_31 = view_95.transpose(1, 2) + view_95 = None + linear_219 = torch._C._nn.linear( + hidden_states_312, + 
l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_312 = l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_96 = linear_219.view((1, 2, -1, 128)) + linear_219 = None + value_states_31 = view_96.transpose(1, 2) + view_96 = None + getitem_194 = key_states_31[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + key_states_31 = None + hidden_states_313 = getitem_194.expand(1, 4, 4, 2, 128) + getitem_194 = None + key_62 = hidden_states_313.reshape(1, 16, 2, 128) + hidden_states_313 = None + getitem_195 = value_states_31[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_31 = None + hidden_states_314 = getitem_195.expand(1, 4, 4, 2, 128) + getitem_195 = None + value_62 = hidden_states_314.reshape(1, 16, 2, 128) + hidden_states_314 = None + attention_mask_32 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_31 = query_states_31.contiguous() + query_states_31 = None + key_63 = key_62.contiguous() + key_62 = None + value_63 = value_62.contiguous() + value_62 = None + attn_output_124 = torch._C._nn.scaled_dot_product_attention( + query_31, + key_63, + value_63, + attn_mask=attention_mask_32, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_31 = key_63 = value_63 = attention_mask_32 = None + transpose_128 = attn_output_124.transpose(1, 2) + attn_output_124 = None + attn_output_125 = transpose_128.contiguous() + transpose_128 = None + reshape_95 = attn_output_125.reshape(1, 2, -1) + attn_output_125 = None + attn_output_126 = reshape_95.contiguous() + reshape_95 = None + attn_output_127 = torch._C._nn.linear( + attn_output_126, + l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_126 = l_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_315 = hidden_states_309 + attn_output_127 + hidden_states_309 = attn_output_127 = None + hidden_states_316 = hidden_states_315.to(torch.float32) + pow_64 = hidden_states_316.pow(2) + variance_63 = pow_64.mean(-1, keepdim=True) + pow_64 = None + add_174 = variance_63 + 1e-06 + variance_63 = None + rsqrt_63 = torch.rsqrt(add_174) + add_174 = None + hidden_states_317 = hidden_states_316 * rsqrt_63 + hidden_states_316 = rsqrt_63 = None + to_131 = hidden_states_317.to(torch.bfloat16) + hidden_states_317 = None + hidden_states_318 = ( + l_self_modules_model_modules_model_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ + * to_131 + ) + l_self_modules_model_modules_model_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = ( + to_131 + ) = None + linear_221 = torch._C._nn.linear( + hidden_states_318, + l_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_31 = torch.nn.functional.silu(linear_221, inplace=False) + linear_221 = None + linear_222 = torch._C._nn.linear( + hidden_states_318, + 
l_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_318 = l_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_258 = silu_31 * linear_222 + silu_31 = linear_222 = None + down_proj_31 = torch._C._nn.linear( + mul_258, + l_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_258 = l_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_319 = hidden_states_315 + down_proj_31 + hidden_states_315 = down_proj_31 = None + hidden_states_320 = hidden_states_319.to(torch.float32) + pow_65 = hidden_states_320.pow(2) + variance_64 = pow_65.mean(-1, keepdim=True) + pow_65 = None + add_176 = variance_64 + 1e-06 + variance_64 = None + rsqrt_64 = torch.rsqrt(add_176) + add_176 = None + hidden_states_321 = hidden_states_320 * rsqrt_64 + hidden_states_320 = rsqrt_64 = None + to_133 = hidden_states_321.to(torch.bfloat16) + hidden_states_321 = None + hidden_states_322 = ( + l_self_modules_model_modules_model_modules_layers_modules_32_modules_input_layernorm_parameters_weight_ + * to_133 + ) + l_self_modules_model_modules_model_modules_layers_modules_32_modules_input_layernorm_parameters_weight_ = ( + to_133 + ) = None + linear_224 = torch._C._nn.linear( + hidden_states_322, + l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_97 = linear_224.view((1, 2, -1, 128)) + linear_224 = None + query_states_32 = view_97.transpose(1, 2) + view_97 = None + linear_225 = torch._C._nn.linear( + hidden_states_322, + l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_98 = linear_225.view((1, 2, -1, 128)) + linear_225 = None + key_states_32 = view_98.transpose(1, 2) + view_98 = None + linear_226 = torch._C._nn.linear( + hidden_states_322, + l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_322 = l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_99 = linear_226.view((1, 2, -1, 128)) + linear_226 = None + value_states_32 = view_99.transpose(1, 2) + view_99 = None + cos_27 = cos_2.unsqueeze(1) + sin_27 = sin_2.unsqueeze(1) + mul_261 = query_states_32 * cos_27 + x1_48 = query_states_32[(Ellipsis, slice(None, 64, None))] + x2_48 = query_states_32[(Ellipsis, slice(64, None, None))] + query_states_32 = None + neg_48 = -x2_48 + x2_48 = None + cat_49 = torch.cat((neg_48, x1_48), dim=-1) + neg_48 = x1_48 = None + mul_262 = cat_49 * sin_27 + cat_49 = None + q_embed_24 = mul_261 + mul_262 + mul_261 = mul_262 = None + mul_263 = key_states_32 * cos_27 + cos_27 = None + x1_49 = key_states_32[(Ellipsis, slice(None, 64, None))] + x2_49 = key_states_32[(Ellipsis, slice(64, None, None))] + key_states_32 = None + neg_49 = -x2_49 + x2_49 = None + cat_50 = torch.cat((neg_49, x1_49), dim=-1) + neg_49 = x1_49 = None + mul_264 = cat_50 * sin_27 + cat_50 = sin_27 = 
None + k_embed_24 = mul_263 + mul_264 + mul_263 = mul_264 = None + getitem_201 = k_embed_24[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_24 = None + hidden_states_323 = getitem_201.expand(1, 4, 4, 2, 128) + getitem_201 = None + key_64 = hidden_states_323.reshape(1, 16, 2, 128) + hidden_states_323 = None + getitem_202 = value_states_32[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_32 = None + hidden_states_324 = getitem_202.expand(1, 4, 4, 2, 128) + getitem_202 = None + value_64 = hidden_states_324.reshape(1, 16, 2, 128) + hidden_states_324 = None + attention_mask_33 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_32 = q_embed_24.contiguous() + q_embed_24 = None + key_65 = key_64.contiguous() + key_64 = None + value_65 = value_64.contiguous() + value_64 = None + attn_output_128 = torch._C._nn.scaled_dot_product_attention( + query_32, + key_65, + value_65, + attn_mask=attention_mask_33, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_32 = key_65 = value_65 = attention_mask_33 = None + transpose_132 = attn_output_128.transpose(1, 2) + attn_output_128 = None + attn_output_129 = transpose_132.contiguous() + transpose_132 = None + reshape_98 = attn_output_129.reshape(1, 2, -1) + attn_output_129 = None + attn_output_130 = reshape_98.contiguous() + reshape_98 = None + attn_output_131 = torch._C._nn.linear( + attn_output_130, + l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_130 = l_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_325 = hidden_states_319 + attn_output_131 + hidden_states_319 = attn_output_131 = None + hidden_states_326 = hidden_states_325.to(torch.float32) + pow_66 = hidden_states_326.pow(2) + variance_65 = pow_66.mean(-1, keepdim=True) + pow_66 = None + add_180 = variance_65 + 1e-06 + variance_65 = None + rsqrt_65 = torch.rsqrt(add_180) + add_180 = None + hidden_states_327 = hidden_states_326 * rsqrt_65 + hidden_states_326 = rsqrt_65 = None + to_135 = hidden_states_327.to(torch.bfloat16) + hidden_states_327 = None + hidden_states_328 = ( + l_self_modules_model_modules_model_modules_layers_modules_32_modules_post_attention_layernorm_parameters_weight_ + * to_135 + ) + l_self_modules_model_modules_model_modules_layers_modules_32_modules_post_attention_layernorm_parameters_weight_ = ( + to_135 + ) = None + linear_228 = torch._C._nn.linear( + hidden_states_328, + l_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_32 = torch.nn.functional.silu(linear_228, inplace=False) + linear_228 = None + linear_229 = torch._C._nn.linear( + hidden_states_328, + l_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_328 = l_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_267 = silu_32 * linear_229 + silu_32 = linear_229 = None + down_proj_32 = 
torch._C._nn.linear( + mul_267, + l_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_267 = l_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_329 = hidden_states_325 + down_proj_32 + hidden_states_325 = down_proj_32 = None + hidden_states_330 = hidden_states_329.to(torch.float32) + pow_67 = hidden_states_330.pow(2) + variance_66 = pow_67.mean(-1, keepdim=True) + pow_67 = None + add_182 = variance_66 + 1e-06 + variance_66 = None + rsqrt_66 = torch.rsqrt(add_182) + add_182 = None + hidden_states_331 = hidden_states_330 * rsqrt_66 + hidden_states_330 = rsqrt_66 = None + to_137 = hidden_states_331.to(torch.bfloat16) + hidden_states_331 = None + hidden_states_332 = ( + l_self_modules_model_modules_model_modules_layers_modules_33_modules_input_layernorm_parameters_weight_ + * to_137 + ) + l_self_modules_model_modules_model_modules_layers_modules_33_modules_input_layernorm_parameters_weight_ = ( + to_137 + ) = None + linear_231 = torch._C._nn.linear( + hidden_states_332, + l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_100 = linear_231.view((1, 2, -1, 128)) + linear_231 = None + query_states_33 = view_100.transpose(1, 2) + view_100 = None + linear_232 = torch._C._nn.linear( + hidden_states_332, + l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_101 = linear_232.view((1, 2, -1, 128)) + linear_232 = None + key_states_33 = view_101.transpose(1, 2) + view_101 = None + linear_233 = torch._C._nn.linear( + hidden_states_332, + l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_332 = l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_102 = linear_233.view((1, 2, -1, 128)) + linear_233 = None + value_states_33 = view_102.transpose(1, 2) + view_102 = None + cos_28 = cos_2.unsqueeze(1) + sin_28 = sin_2.unsqueeze(1) + mul_270 = query_states_33 * cos_28 + x1_50 = query_states_33[(Ellipsis, slice(None, 64, None))] + x2_50 = query_states_33[(Ellipsis, slice(64, None, None))] + query_states_33 = None + neg_50 = -x2_50 + x2_50 = None + cat_51 = torch.cat((neg_50, x1_50), dim=-1) + neg_50 = x1_50 = None + mul_271 = cat_51 * sin_28 + cat_51 = None + q_embed_25 = mul_270 + mul_271 + mul_270 = mul_271 = None + mul_272 = key_states_33 * cos_28 + cos_28 = None + x1_51 = key_states_33[(Ellipsis, slice(None, 64, None))] + x2_51 = key_states_33[(Ellipsis, slice(64, None, None))] + key_states_33 = None + neg_51 = -x2_51 + x2_51 = None + cat_52 = torch.cat((neg_51, x1_51), dim=-1) + neg_51 = x1_51 = None + mul_273 = cat_52 * sin_28 + cat_52 = sin_28 = None + k_embed_25 = mul_272 + mul_273 + mul_272 = mul_273 = None + getitem_208 = k_embed_25[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_25 = None + hidden_states_333 = getitem_208.expand(1, 4, 4, 2, 128) + getitem_208 = None + key_66 = 
hidden_states_333.reshape(1, 16, 2, 128) + hidden_states_333 = None + getitem_209 = value_states_33[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_33 = None + hidden_states_334 = getitem_209.expand(1, 4, 4, 2, 128) + getitem_209 = None + value_66 = hidden_states_334.reshape(1, 16, 2, 128) + hidden_states_334 = None + attention_mask_34 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_33 = q_embed_25.contiguous() + q_embed_25 = None + key_67 = key_66.contiguous() + key_66 = None + value_67 = value_66.contiguous() + value_66 = None + attn_output_132 = torch._C._nn.scaled_dot_product_attention( + query_33, + key_67, + value_67, + attn_mask=attention_mask_34, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_33 = key_67 = value_67 = attention_mask_34 = None + transpose_136 = attn_output_132.transpose(1, 2) + attn_output_132 = None + attn_output_133 = transpose_136.contiguous() + transpose_136 = None + reshape_101 = attn_output_133.reshape(1, 2, -1) + attn_output_133 = None + attn_output_134 = reshape_101.contiguous() + reshape_101 = None + attn_output_135 = torch._C._nn.linear( + attn_output_134, + l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_134 = l_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_335 = hidden_states_329 + attn_output_135 + hidden_states_329 = attn_output_135 = None + hidden_states_336 = hidden_states_335.to(torch.float32) + pow_68 = hidden_states_336.pow(2) + variance_67 = pow_68.mean(-1, keepdim=True) + pow_68 = None + add_186 = variance_67 + 1e-06 + variance_67 = None + rsqrt_67 = torch.rsqrt(add_186) + add_186 = None + hidden_states_337 = hidden_states_336 * rsqrt_67 + hidden_states_336 = rsqrt_67 = None + to_139 = hidden_states_337.to(torch.bfloat16) + hidden_states_337 = None + hidden_states_338 = ( + l_self_modules_model_modules_model_modules_layers_modules_33_modules_post_attention_layernorm_parameters_weight_ + * to_139 + ) + l_self_modules_model_modules_model_modules_layers_modules_33_modules_post_attention_layernorm_parameters_weight_ = ( + to_139 + ) = None + linear_235 = torch._C._nn.linear( + hidden_states_338, + l_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_33 = torch.nn.functional.silu(linear_235, inplace=False) + linear_235 = None + linear_236 = torch._C._nn.linear( + hidden_states_338, + l_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_338 = l_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_276 = silu_33 * linear_236 + silu_33 = linear_236 = None + down_proj_33 = torch._C._nn.linear( + mul_276, + l_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_276 = l_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_339 = hidden_states_335 + down_proj_33 + 
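
Each gate_proj / silu / up_proj / down_proj run, such as linear_235 through down_proj_33 just above, is a SwiGLU MLP flattened by the tracer. A sketch of the computation under that reading (function and argument names are illustrative):

    import torch.nn.functional as F

    def swiglu_mlp(x, gate_weight, up_weight, down_weight):
        # down_proj(silu(gate_proj(x)) * up_proj(x)): the gated activation
        # modulates the up projection before the final down projection.
        gate = F.silu(F.linear(x, gate_weight))
        up = F.linear(x, up_weight)
        return F.linear(gate * up, down_weight)
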
hidden_states_335 = down_proj_33 = None + hidden_states_340 = hidden_states_339.to(torch.float32) + pow_69 = hidden_states_340.pow(2) + variance_68 = pow_69.mean(-1, keepdim=True) + pow_69 = None + add_188 = variance_68 + 1e-06 + variance_68 = None + rsqrt_68 = torch.rsqrt(add_188) + add_188 = None + hidden_states_341 = hidden_states_340 * rsqrt_68 + hidden_states_340 = rsqrt_68 = None + to_141 = hidden_states_341.to(torch.bfloat16) + hidden_states_341 = None + hidden_states_342 = ( + l_self_modules_model_modules_model_modules_layers_modules_34_modules_input_layernorm_parameters_weight_ + * to_141 + ) + l_self_modules_model_modules_model_modules_layers_modules_34_modules_input_layernorm_parameters_weight_ = ( + to_141 + ) = None + linear_238 = torch._C._nn.linear( + hidden_states_342, + l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_103 = linear_238.view((1, 2, -1, 128)) + linear_238 = None + query_states_34 = view_103.transpose(1, 2) + view_103 = None + linear_239 = torch._C._nn.linear( + hidden_states_342, + l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_104 = linear_239.view((1, 2, -1, 128)) + linear_239 = None + key_states_34 = view_104.transpose(1, 2) + view_104 = None + linear_240 = torch._C._nn.linear( + hidden_states_342, + l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_342 = l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_105 = linear_240.view((1, 2, -1, 128)) + linear_240 = None + value_states_34 = view_105.transpose(1, 2) + view_105 = None + cos_29 = cos_2.unsqueeze(1) + cos_2 = None + sin_29 = sin_2.unsqueeze(1) + sin_2 = None + mul_279 = query_states_34 * cos_29 + x1_52 = query_states_34[(Ellipsis, slice(None, 64, None))] + x2_52 = query_states_34[(Ellipsis, slice(64, None, None))] + query_states_34 = None + neg_52 = -x2_52 + x2_52 = None + cat_53 = torch.cat((neg_52, x1_52), dim=-1) + neg_52 = x1_52 = None + mul_280 = cat_53 * sin_29 + cat_53 = None + q_embed_26 = mul_279 + mul_280 + mul_279 = mul_280 = None + mul_281 = key_states_34 * cos_29 + cos_29 = None + x1_53 = key_states_34[(Ellipsis, slice(None, 64, None))] + x2_53 = key_states_34[(Ellipsis, slice(64, None, None))] + key_states_34 = None + neg_53 = -x2_53 + x2_53 = None + cat_54 = torch.cat((neg_53, x1_53), dim=-1) + neg_53 = x1_53 = None + mul_282 = cat_54 * sin_29 + cat_54 = sin_29 = None + k_embed_26 = mul_281 + mul_282 + mul_281 = mul_282 = None + getitem_215 = k_embed_26[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_26 = None + hidden_states_343 = getitem_215.expand(1, 4, 4, 2, 128) + getitem_215 = None + key_68 = hidden_states_343.reshape(1, 16, 2, 128) + hidden_states_343 = None + getitem_216 = value_states_34[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_34 = None + hidden_states_344 = getitem_216.expand(1, 4, 4, 2, 128) + 
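
The slice/negate/concat pattern above (x1_52, x2_52, neg_52, cat_53) is the standard rotate_half formulation of rotary position embeddings, with the 128-wide head dimension split at index 64. A sketch, assuming cos and sin have already been broadcast to the head axis as the trace's unsqueeze(1) does:

    import torch

    def rotate_half(x):
        # Swap the two halves of the head dimension with a sign flip
        # (the trace hard-codes the split point 64 for head_dim 128).
        half = x.shape[-1] // 2
        return torch.cat((-x[..., half:], x[..., :half]), dim=-1)

    def apply_rope(q, k, cos, sin):
        return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin
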
getitem_216 = None + value_68 = hidden_states_344.reshape(1, 16, 2, 128) + hidden_states_344 = None + attention_mask_35 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + query_34 = q_embed_26.contiguous() + q_embed_26 = None + key_69 = key_68.contiguous() + key_68 = None + value_69 = value_68.contiguous() + value_68 = None + attn_output_136 = torch._C._nn.scaled_dot_product_attention( + query_34, + key_69, + value_69, + attn_mask=attention_mask_35, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_34 = key_69 = value_69 = attention_mask_35 = None + transpose_140 = attn_output_136.transpose(1, 2) + attn_output_136 = None + attn_output_137 = transpose_140.contiguous() + transpose_140 = None + reshape_104 = attn_output_137.reshape(1, 2, -1) + attn_output_137 = None + attn_output_138 = reshape_104.contiguous() + reshape_104 = None + attn_output_139 = torch._C._nn.linear( + attn_output_138, + l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_138 = l_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_345 = hidden_states_339 + attn_output_139 + hidden_states_339 = attn_output_139 = None + hidden_states_346 = hidden_states_345.to(torch.float32) + pow_70 = hidden_states_346.pow(2) + variance_69 = pow_70.mean(-1, keepdim=True) + pow_70 = None + add_192 = variance_69 + 1e-06 + variance_69 = None + rsqrt_69 = torch.rsqrt(add_192) + add_192 = None + hidden_states_347 = hidden_states_346 * rsqrt_69 + hidden_states_346 = rsqrt_69 = None + to_143 = hidden_states_347.to(torch.bfloat16) + hidden_states_347 = None + hidden_states_348 = ( + l_self_modules_model_modules_model_modules_layers_modules_34_modules_post_attention_layernorm_parameters_weight_ + * to_143 + ) + l_self_modules_model_modules_model_modules_layers_modules_34_modules_post_attention_layernorm_parameters_weight_ = ( + to_143 + ) = None + linear_242 = torch._C._nn.linear( + hidden_states_348, + l_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_34 = torch.nn.functional.silu(linear_242, inplace=False) + linear_242 = None + linear_243 = torch._C._nn.linear( + hidden_states_348, + l_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_348 = l_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_285 = silu_34 * linear_243 + silu_34 = linear_243 = None + down_proj_34 = torch._C._nn.linear( + mul_285, + l_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_285 = l_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_349 = hidden_states_345 + down_proj_34 + hidden_states_345 = down_proj_34 = None + hidden_states_350 = hidden_states_349.to(torch.float32) + pow_71 = hidden_states_350.pow(2) + variance_70 = pow_71.mean(-1, keepdim=True) + pow_71 = None + add_194 = variance_70 + 1e-06 + variance_70 = None + rsqrt_70 = torch.rsqrt(add_194) + add_194 = None + 
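
The expand(1, 4, 4, 2, 128) followed by reshape(1, 16, 2, 128) seen for key_68 and value_68 is grouped-query attention's repeat_kv step: 4 KV heads are broadcast to the 16 query heads (4 per group) before scaled_dot_product_attention, whose scale 0.08838834764831845 is 1/sqrt(128) for head_dim 128. A sketch with the shapes taken from this trace:

    import math
    import torch
    import torch.nn.functional as F

    def repeat_kv(x, n_rep):
        # (batch, kv_heads, seq, head_dim) -> (batch, kv_heads * n_rep, seq, head_dim)
        b, kv, s, d = x.shape
        return x[:, :, None].expand(b, kv, n_rep, s, d).reshape(b, kv * n_rep, s, d)

    q = torch.randn(1, 16, 2, 128)
    k = repeat_kv(torch.randn(1, 4, 2, 128), n_rep=4)
    v = repeat_kv(torch.randn(1, 4, 2, 128), n_rep=4)
    out = F.scaled_dot_product_attention(q, k, v, scale=1 / math.sqrt(128))
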
hidden_states_351 = hidden_states_350 * rsqrt_70 + hidden_states_350 = rsqrt_70 = None + to_145 = hidden_states_351.to(torch.bfloat16) + hidden_states_351 = None + hidden_states_352 = ( + l_self_modules_model_modules_model_modules_layers_modules_35_modules_input_layernorm_parameters_weight_ + * to_145 + ) + l_self_modules_model_modules_model_modules_layers_modules_35_modules_input_layernorm_parameters_weight_ = ( + to_145 + ) = None + linear_245 = torch._C._nn.linear( + hidden_states_352, + l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_106 = linear_245.view((1, 2, -1, 128)) + linear_245 = None + query_states_35 = view_106.transpose(1, 2) + view_106 = None + linear_246 = torch._C._nn.linear( + hidden_states_352, + l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_107 = linear_246.view((1, 2, -1, 128)) + linear_246 = None + key_states_35 = view_107.transpose(1, 2) + view_107 = None + linear_247 = torch._C._nn.linear( + hidden_states_352, + l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_352 = l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_108 = linear_247.view((1, 2, -1, 128)) + linear_247 = None + value_states_35 = view_108.transpose(1, 2) + view_108 = None + getitem_218 = key_states_35[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + key_states_35 = None + hidden_states_353 = getitem_218.expand(1, 4, 4, 2, 128) + getitem_218 = None + key_70 = hidden_states_353.reshape(1, 16, 2, 128) + hidden_states_353 = None + getitem_219 = value_states_35[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_35 = None + hidden_states_354 = getitem_219.expand(1, 4, 4, 2, 128) + getitem_219 = None + value_70 = hidden_states_354.reshape(1, 16, 2, 128) + hidden_states_354 = None + attention_mask_36 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + causal_mask_2 = None + query_35 = query_states_35.contiguous() + query_states_35 = None + key_71 = key_70.contiguous() + key_70 = None + value_71 = value_70.contiguous() + value_70 = None + attn_output_140 = torch._C._nn.scaled_dot_product_attention( + query_35, + key_71, + value_71, + attn_mask=attention_mask_36, + dropout_p=0.0, + scale=0.08838834764831845, + is_causal=False, + ) + query_35 = key_71 = value_71 = attention_mask_36 = None + transpose_144 = attn_output_140.transpose(1, 2) + attn_output_140 = None + attn_output_141 = transpose_144.contiguous() + transpose_144 = None + reshape_107 = attn_output_141.reshape(1, 2, -1) + attn_output_141 = None + attn_output_142 = reshape_107.contiguous() + reshape_107 = None + attn_output_143 = torch._C._nn.linear( + attn_output_142, + l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + 
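
Unlike its neighbours, the block above for layer 35 (and likewise layers 27 and 31 earlier in this file) feeds query_states straight into attention with no cos/sin rotation. This matches a NoPE layout in which rotary embeddings are skipped on every fourth layer; a hedged sketch of that assumed per-layer switch:

    def layer_uses_rope(layer_idx, no_rope_interval=4):
        # Under this assumed scheme, layers 3, 7, ..., 27, 31, 35 skip RoPE.
        return (layer_idx + 1) % no_rope_interval != 0
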
attn_output_142 = l_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_355 = hidden_states_349 + attn_output_143 + hidden_states_349 = attn_output_143 = None + hidden_states_356 = hidden_states_355.to(torch.float32) + pow_72 = hidden_states_356.pow(2) + variance_71 = pow_72.mean(-1, keepdim=True) + pow_72 = None + add_196 = variance_71 + 1e-06 + variance_71 = None + rsqrt_71 = torch.rsqrt(add_196) + add_196 = None + hidden_states_357 = hidden_states_356 * rsqrt_71 + hidden_states_356 = rsqrt_71 = None + to_147 = hidden_states_357.to(torch.bfloat16) + hidden_states_357 = None + hidden_states_358 = ( + l_self_modules_model_modules_model_modules_layers_modules_35_modules_post_attention_layernorm_parameters_weight_ + * to_147 + ) + l_self_modules_model_modules_model_modules_layers_modules_35_modules_post_attention_layernorm_parameters_weight_ = ( + to_147 + ) = None + linear_249 = torch._C._nn.linear( + hidden_states_358, + l_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_35 = torch.nn.functional.silu(linear_249, inplace=False) + linear_249 = None + linear_250 = torch._C._nn.linear( + hidden_states_358, + l_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_358 = l_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_290 = silu_35 * linear_250 + silu_35 = linear_250 = None + down_proj_35 = torch._C._nn.linear( + mul_290, + l_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_290 = l_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_359 = hidden_states_355 + down_proj_35 + hidden_states_355 = down_proj_35 = None + hidden_states_360 = hidden_states_359.to(torch.float32) + hidden_states_359 = None + pow_73 = hidden_states_360.pow(2) + variance_72 = pow_73.mean(-1, keepdim=True) + pow_73 = None + add_198 = variance_72 + 1e-06 + variance_72 = None + rsqrt_72 = torch.rsqrt(add_198) + add_198 = None + hidden_states_361 = hidden_states_360 * rsqrt_72 + hidden_states_360 = rsqrt_72 = None + to_149 = hidden_states_361.to(torch.bfloat16) + hidden_states_361 = None + hidden_states_362 = ( + l_self_modules_model_modules_model_modules_norm_parameters_weight_ * to_149 + ) + l_self_modules_model_modules_model_modules_norm_parameters_weight_ = ( + to_149 + ) = None + getitem_221 = hidden_states_362[ + (slice(None, None, None), slice(0, None, None), slice(None, None, None)) + ] + hidden_states_362 = None + logits = torch._C._nn.linear( + getitem_221, + l_self_modules_model_modules_model_modules_embed_tokens_parameters_weight_, + None, + ) + getitem_221 = ( + l_self_modules_model_modules_model_modules_embed_tokens_parameters_weight_ + ) = None + return (logits,) diff --git a/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/weight_meta.py b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/weight_meta.py new file mode 100644 index 000000000..ddb33526c --- /dev/null +++ b/samples/transformers-auto-model/HuggingFaceTB/SmolLM3-3B/weight_meta.py @@ -0,0 +1,3288 @@ +class 
Program_weight_tensor_meta_L_kwargs_input_ids_: + name = "L_kwargs_input_ids_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [9906, 1917] + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_embed_tokens_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_embed_tokens_parameters_weight_" + shape = [128256, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_kwargs_attention_mask_: + name = "L_kwargs_attention_mask_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1] + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_rotary_emb_buffers_inv_freq_: + name = "L_self_modules_model_modules_model_modules_rotary_emb_buffers_inv_freq_" + shape = [64] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.073 + std = 0.190 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: + name = 
"L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_" + shape 
= [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_24_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_24_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_25_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_25_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_26_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_26_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_27_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_27_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_28_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_28_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_29_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_29_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_30_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_30_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_31_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_31_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_32_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_32_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_32_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_32_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_32_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_32_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_33_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_33_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_33_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_33_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_33_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_up_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_up_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_down_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_33_modules_mlp_modules_down_proj_parameters_weight_"
+    shape = [2048, 11008]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_34_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_34_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_34_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_34_modules_post_attention_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_34_modules_post_attention_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_gate_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_gate_proj_parameters_weight_"
+    shape = [11008, 2048]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_34_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_35_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_35_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [512, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_35_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_35_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_35_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_up_proj_parameters_weight_" + shape = [11008, 2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_layers_modules_35_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 11008] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_model_modules_norm_parameters_weight_: + name = "L_self_modules_model_modules_model_modules_norm_parameters_weight_" + shape = [2048] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/graph_hash.txt b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/graph_hash.txt new file mode 100644 index 000000000..230bb15c2 --- /dev/null +++ b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/graph_hash.txt @@ -0,0 +1 @@ +9615abe0dd7b999472a85cc937d9d93b8c1c6218b88174775993a65f32de64e0 \ No newline at end of file diff --git a/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/graph_net.json b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/input_meta.py b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/input_tensor_constraints.py b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/model.py b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/model.py new file mode 100644 index 000000000..19e36aa32 --- /dev/null +++ b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/model.py @@ -0,0 +1,4773 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_kwargs_input_ids_: torch.Tensor, + L_self_modules_model_modules_embed_tokens_parameters_weight_: torch.nn.parameter.Parameter, + L_kwargs_attention_mask_: torch.Tensor, + L_self_modules_model_modules_rotary_emb_buffers_inv_freq_: torch.Tensor, + L_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
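Note that the weight_meta.py records above carry only tensor statistics (shape, dtype, device, mean, std) and set data = None, so the weights have to be re-materialized before the captured graph can be replayed. A minimal sketch of one way to do that is shown below; the helper name materialize_weight is a hypothetical illustration, not something the generated sample files provide, and it assumes the recorded device (here "cuda:0") is actually available at load time.

import torch

def materialize_weight(meta):
    # meta is one of the Program_weight_tensor_meta_* classes above.
    # If real data were recorded, it would be loaded here instead.
    assert meta.data is None
    # Map the recorded dtype string onto the torch dtype object,
    # e.g. "torch.bfloat16" -> torch.bfloat16.
    dtype = getattr(torch, meta.dtype.split(".")[-1])
    # Draw a placeholder matching the recorded statistics; std = 0.000
    # (e.g. the layernorm weights above) degenerates to the constant mean.
    tensor = torch.normal(mean=meta.mean, std=meta.std, size=tuple(meta.shape))
    return tensor.to(dtype=dtype, device=meta.device)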
L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_model_modules_norm_parameters_weight_: torch.nn.parameter.Parameter, + ): + l_kwargs_input_ids_ = L_kwargs_input_ids_ + l_self_modules_model_modules_embed_tokens_parameters_weight_ = ( + L_self_modules_model_modules_embed_tokens_parameters_weight_ + ) + l_kwargs_attention_mask_ = L_kwargs_attention_mask_ + l_self_modules_model_modules_rotary_emb_buffers_inv_freq_ = ( + L_self_modules_model_modules_rotary_emb_buffers_inv_freq_ + ) + l_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ 
= L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ + 
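+        # NOTE (annotation): each `l_*` name above is a graph input bound to one
+        # module parameter. The naming appears to be the standard torch.compile
+        # (Dynamo) flattening: e.g. l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_
+        # is the flattened spelling of self.model.layers[15].self_attn.q_proj.weight.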
l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ 
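+        # NOTE (annotation): the tracer unrolls the Python loop over decoder
+        # layers, so these bindings -- and the attention/MLP block further down --
+        # repeat once per layer with fresh SSA-style names (linear, linear_1, ...,
+        # q_embed, q_embed_1, ...).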
+ l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ 
= L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_norm_parameters_weight_ = (
+            L_self_modules_model_modules_norm_parameters_weight_
+        )
+        inputs_embeds = torch.nn.functional.embedding(
+            l_kwargs_input_ids_,
+            l_self_modules_model_modules_embed_tokens_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_kwargs_input_ids_ = (
+            l_self_modules_model_modules_embed_tokens_parameters_weight_
+        ) = None
+        cache_position = torch.arange(0, 3, device=device(type="cuda", index=0))
+        position_ids = cache_position.unsqueeze(0)
+        attention_mask = l_kwargs_attention_mask_.to(
+            device=device(type="cuda", index=0), dtype=torch.bool
+        )
+        l_kwargs_attention_mask_ = None
+        mask_indices = torch.arange(3, device=device(type="cuda", index=0))
+        mask_indices += 0
+        mask_indices_1 = mask_indices
+        mask_indices = None
+        local_padding_mask = attention_mask[(slice(None, None, None), mask_indices_1)]
+        attention_mask = mask_indices_1 = None
+        kv_arange = torch.arange(3, device=device(type="cuda", index=0))
+        kv_arange += 0
+        kv_arange_1 = kv_arange
+        kv_arange = None
+        reshaped_cache_position = cache_position.view(-1, 1)
+        cache_position = None
+        causal_mask = kv_arange_1 <= reshaped_cache_position
+        kv_arange_1 = reshaped_cache_position = None
+        getitem_1 = causal_mask[
+            (None, None, slice(None, None, None), slice(None, None, None))
+        ]
+        causal_mask = None
+        causal_mask_1 = getitem_1.expand(1, -1, -1, -1)
+        getitem_1 = None
+        getitem_2 = local_padding_mask[
+            (slice(None, None, None), None, None, slice(None, None, None))
+        ]
+        local_padding_mask = None
+        causal_mask_2 = causal_mask_1 * getitem_2
+        causal_mask_1 = getitem_2 = None
+        _set_grad_enabled = torch._C._set_grad_enabled(False)
+        _set_grad_enabled = None
+        getitem_3 = l_self_modules_model_modules_rotary_emb_buffers_inv_freq_[
+            (None, slice(None, None, None), None)
+        ]
+        l_self_modules_model_modules_rotary_emb_buffers_inv_freq_ = None
+        float_1 = getitem_3.float()
+        getitem_3 = None
+        expand_1 = float_1.expand(1, -1, 1)
+        float_1 = None
+        inv_freq_expanded = expand_1.to(device(type="cuda", index=0))
+        expand_1 = None
+        getitem_4 = position_ids[
+            (slice(None, None, None), None, slice(None, None, None))
+        ]
+        position_ids = None
+        position_ids_expanded = getitem_4.float()
+        getitem_4 = None
+        float_3 = inv_freq_expanded.float()
+        inv_freq_expanded = None
+        float_4 = position_ids_expanded.float()
+        position_ids_expanded = None
+        matmul = float_3 @ float_4
+        float_3 = float_4 = None
+        freqs = matmul.transpose(1, 2)
+        matmul = None
+        emb = torch.cat((freqs, freqs), dim=-1)
+        freqs = None
+        cos = emb.cos()
+        cos_1 = cos * 1.0
+        cos = None
+        sin = emb.sin()
+        emb = None
+        sin_1 = sin * 1.0
+        sin = None
+        cos_2 = cos_1.to(dtype=torch.float32)
+        cos_1 = None
+        sin_2 = sin_1.to(dtype=torch.float32)
+        sin_1 = None
+        _set_grad_enabled_1 = torch._C._set_grad_enabled(True)
+        _set_grad_enabled_1 = None
+        _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module")
+        _log_api_usage_once = None
+        hidden_states = inputs_embeds.to(torch.float32)
+        pow_1 = hidden_states.pow(2)
+        variance = pow_1.mean(-1, keepdim=True)
+        pow_1 = None
+        add = variance + 1e-05
+        variance = None
+        rsqrt = torch.rsqrt(add)
+        add = None
+        hidden_states_1 = hidden_states * rsqrt
+        hidden_states = rsqrt = None
+        to_5 = hidden_states_1.to(torch.float32)
+        hidden_states_1 = None
+        hidden_states_2 = (
+            l_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_
+            * to_5
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = (
+            to_5
+        ) = None
+        linear = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = (
+            None
+        )
+        view_1 = linear.view((1, 3, -1, 64))
+        linear = None
+        query_states = view_1.transpose(1, 2)
+        view_1 = None
+        linear_1 = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = (
+            None
+        )
+        view_2 = linear_1.view((1, 3, -1, 64))
+        linear_1 = None
+        key_states = view_2.transpose(1, 2)
+        view_2 = None
+        linear_2 = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_2 = l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = (None)
+        view_3 = linear_2.view((1, 3, -1, 64))
+        linear_2 = None
+        value_states = view_3.transpose(1, 2)
+        view_3 = None
+        cos_3 = cos_2.unsqueeze(1)
+        sin_3 = sin_2.unsqueeze(1)
+        mul_5 = query_states * cos_3
+        x1 = query_states[(Ellipsis, slice(None, 32, None))]
+        x2 = query_states[(Ellipsis, slice(32, None, None))]
+        query_states = None
+        neg = -x2
+        x2 = None
+        cat_1 = torch.cat((neg, x1), dim=-1)
+        neg = x1 = None
+        mul_6 = cat_1 * sin_3
+        cat_1 = None
+        q_embed = mul_5 + mul_6
+        mul_5 = mul_6 = None
+        mul_7 = key_states * cos_3
+        cos_3 = None
+        x1_1 = key_states[(Ellipsis, slice(None, 32, None))]
+        x2_1 = key_states[(Ellipsis, slice(32, None, None))]
+        key_states = None
+        neg_1 = -x2_1
+        x2_1 = None
+        cat_2 = torch.cat((neg_1, x1_1), dim=-1)
+        neg_1 = x1_1 = None
+        mul_8 = cat_2 * sin_3
+        cat_2 = sin_3 = None
+        k_embed = mul_7 + mul_8
+        mul_7 = mul_8 = None
+        getitem_9 = k_embed[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        k_embed = None
+        hidden_states_3 = getitem_9.expand(1, 4, 8, 3, 64)
+        getitem_9 = None
+        key = hidden_states_3.reshape(1, 32, 3, 64)
+        hidden_states_3 = None
+        getitem_10 = value_states[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                slice(None, None, None),
+                slice(None, None, None),
+            )
+        ]
+        value_states = None
+        hidden_states_4 = getitem_10.expand(1, 4, 8, 3, 64)
+        getitem_10 = None
+        value = hidden_states_4.reshape(1, 32, 3, 64)
+        hidden_states_4 = None
+        attention_mask_1 = causal_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 3, None),
+            )
+        ]
+        query = q_embed.contiguous()
+        q_embed = None
+        key_1 = key.contiguous()
+        key = None
+        value_1 = value.contiguous()
+        value = None
+        attn_output = torch._C._nn.scaled_dot_product_attention(
+            query,
+            key_1,
+            value_1,
+            attn_mask=attention_mask_1,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        query = key_1 = value_1 = attention_mask_1 = None
+        transpose_4 = attn_output.transpose(1, 2)
+        attn_output = None
+        attn_output_1 = transpose_4.contiguous()
+        transpose_4 = None
+        reshape_2 = attn_output_1.reshape(1, 3, -1)
+        attn_output_1 = None
+        attn_output_2 = reshape_2.contiguous()
+        reshape_2 = None
+        attn_output_3 = torch._C._nn.linear(
+            attn_output_2,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_2 = l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_5 = inputs_embeds + attn_output_3
+        inputs_embeds = attn_output_3 = None
+        hidden_states_6 = hidden_states_5.to(torch.float32)
+        pow_2 = hidden_states_6.pow(2)
+        variance_1 = pow_2.mean(-1, keepdim=True)
+        pow_2 = None
+        add_4 = variance_1 + 1e-05
+        variance_1 = None
+        rsqrt_1 = torch.rsqrt(add_4)
+        add_4 = None
+        hidden_states_7 = hidden_states_6 * rsqrt_1
+        hidden_states_6 = rsqrt_1 = None
+        to_7 = hidden_states_7.to(torch.float32)
+        hidden_states_7 = None
+        hidden_states_8 = (
+            l_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_
+            * to_7
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = (
+            to_7
+        ) = None
+        linear_4 = torch._C._nn.linear(
+            hidden_states_8,
+            l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu = torch.nn.functional.silu(linear_4, inplace=False)
+        linear_4 = None
+        linear_5 = torch._C._nn.linear(
+            hidden_states_8,
+            l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_8 = l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_11 = silu * linear_5
+        silu = linear_5 = None
+        down_proj = torch._C._nn.linear(
+            mul_11,
+            l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_11 = l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_9 = hidden_states_5 + down_proj
+        hidden_states_5 = down_proj = None
+        hidden_states_10 = hidden_states_9.to(torch.float32)
+        pow_3 = hidden_states_10.pow(2)
+        variance_2 = pow_3.mean(-1, keepdim=True)
+        pow_3 = None
+        add_6 =
variance_2 + 1e-05 + variance_2 = None + rsqrt_2 = torch.rsqrt(add_6) + add_6 = None + hidden_states_11 = hidden_states_10 * rsqrt_2 + hidden_states_10 = rsqrt_2 = None + to_9 = hidden_states_11.to(torch.float32) + hidden_states_11 = None + hidden_states_12 = ( + l_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + * to_9 + ) + l_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = ( + to_9 + ) = None + linear_7 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_4 = linear_7.view((1, 3, -1, 64)) + linear_7 = None + query_states_1 = view_4.transpose(1, 2) + view_4 = None + linear_8 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_5 = linear_8.view((1, 3, -1, 64)) + linear_8 = None + key_states_1 = view_5.transpose(1, 2) + view_5 = None + linear_9 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_12 = l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_6 = linear_9.view((1, 3, -1, 64)) + linear_9 = None + value_states_1 = view_6.transpose(1, 2) + view_6 = None + cos_4 = cos_2.unsqueeze(1) + sin_4 = sin_2.unsqueeze(1) + mul_14 = query_states_1 * cos_4 + x1_2 = query_states_1[(Ellipsis, slice(None, 32, None))] + x2_2 = query_states_1[(Ellipsis, slice(32, None, None))] + query_states_1 = None + neg_2 = -x2_2 + x2_2 = None + cat_3 = torch.cat((neg_2, x1_2), dim=-1) + neg_2 = x1_2 = None + mul_15 = cat_3 * sin_4 + cat_3 = None + q_embed_1 = mul_14 + mul_15 + mul_14 = mul_15 = None + mul_16 = key_states_1 * cos_4 + cos_4 = None + x1_3 = key_states_1[(Ellipsis, slice(None, 32, None))] + x2_3 = key_states_1[(Ellipsis, slice(32, None, None))] + key_states_1 = None + neg_3 = -x2_3 + x2_3 = None + cat_4 = torch.cat((neg_3, x1_3), dim=-1) + neg_3 = x1_3 = None + mul_17 = cat_4 * sin_4 + cat_4 = sin_4 = None + k_embed_1 = mul_16 + mul_17 + mul_16 = mul_17 = None + getitem_16 = k_embed_1[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_1 = None + hidden_states_13 = getitem_16.expand(1, 4, 8, 3, 64) + getitem_16 = None + key_2 = hidden_states_13.reshape(1, 32, 3, 64) + hidden_states_13 = None + getitem_17 = value_states_1[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_1 = None + hidden_states_14 = getitem_17.expand(1, 4, 8, 3, 64) + getitem_17 = None + value_2 = hidden_states_14.reshape(1, 32, 3, 64) + hidden_states_14 = None + attention_mask_2 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_1 = q_embed_1.contiguous() + q_embed_1 = None + key_3 = key_2.contiguous() + key_2 = None + value_3 = value_2.contiguous() + value_2 = None + attn_output_4 = torch._C._nn.scaled_dot_product_attention( + query_1, + key_3, + value_3, + 
attn_mask=attention_mask_2, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_1 = key_3 = value_3 = attention_mask_2 = None + transpose_8 = attn_output_4.transpose(1, 2) + attn_output_4 = None + attn_output_5 = transpose_8.contiguous() + transpose_8 = None + reshape_5 = attn_output_5.reshape(1, 3, -1) + attn_output_5 = None + attn_output_6 = reshape_5.contiguous() + reshape_5 = None + attn_output_7 = torch._C._nn.linear( + attn_output_6, + l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_6 = l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_15 = hidden_states_9 + attn_output_7 + hidden_states_9 = attn_output_7 = None + hidden_states_16 = hidden_states_15.to(torch.float32) + pow_4 = hidden_states_16.pow(2) + variance_3 = pow_4.mean(-1, keepdim=True) + pow_4 = None + add_10 = variance_3 + 1e-05 + variance_3 = None + rsqrt_3 = torch.rsqrt(add_10) + add_10 = None + hidden_states_17 = hidden_states_16 * rsqrt_3 + hidden_states_16 = rsqrt_3 = None + to_11 = hidden_states_17.to(torch.float32) + hidden_states_17 = None + hidden_states_18 = ( + l_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + * to_11 + ) + l_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = ( + to_11 + ) = None + linear_11 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_1 = torch.nn.functional.silu(linear_11, inplace=False) + linear_11 = None + linear_12 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_18 = l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_20 = silu_1 * linear_12 + silu_1 = linear_12 = None + down_proj_1 = torch._C._nn.linear( + mul_20, + l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_20 = l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_19 = hidden_states_15 + down_proj_1 + hidden_states_15 = down_proj_1 = None + hidden_states_20 = hidden_states_19.to(torch.float32) + pow_5 = hidden_states_20.pow(2) + variance_4 = pow_5.mean(-1, keepdim=True) + pow_5 = None + add_12 = variance_4 + 1e-05 + variance_4 = None + rsqrt_4 = torch.rsqrt(add_12) + add_12 = None + hidden_states_21 = hidden_states_20 * rsqrt_4 + hidden_states_20 = rsqrt_4 = None + to_13 = hidden_states_21.to(torch.float32) + hidden_states_21 = None + hidden_states_22 = ( + l_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + * to_13 + ) + l_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = ( + to_13 + ) = None + linear_14 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_7 = linear_14.view((1, 3, -1, 64)) + linear_14 = None + query_states_2 = view_7.transpose(1, 2) + view_7 
= None + linear_15 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_8 = linear_15.view((1, 3, -1, 64)) + linear_15 = None + key_states_2 = view_8.transpose(1, 2) + view_8 = None + linear_16 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_22 = l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_9 = linear_16.view((1, 3, -1, 64)) + linear_16 = None + value_states_2 = view_9.transpose(1, 2) + view_9 = None + cos_5 = cos_2.unsqueeze(1) + sin_5 = sin_2.unsqueeze(1) + mul_23 = query_states_2 * cos_5 + x1_4 = query_states_2[(Ellipsis, slice(None, 32, None))] + x2_4 = query_states_2[(Ellipsis, slice(32, None, None))] + query_states_2 = None + neg_4 = -x2_4 + x2_4 = None + cat_5 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_24 = cat_5 * sin_5 + cat_5 = None + q_embed_2 = mul_23 + mul_24 + mul_23 = mul_24 = None + mul_25 = key_states_2 * cos_5 + cos_5 = None + x1_5 = key_states_2[(Ellipsis, slice(None, 32, None))] + x2_5 = key_states_2[(Ellipsis, slice(32, None, None))] + key_states_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_6 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_26 = cat_6 * sin_5 + cat_6 = sin_5 = None + k_embed_2 = mul_25 + mul_26 + mul_25 = mul_26 = None + getitem_23 = k_embed_2[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_2 = None + hidden_states_23 = getitem_23.expand(1, 4, 8, 3, 64) + getitem_23 = None + key_4 = hidden_states_23.reshape(1, 32, 3, 64) + hidden_states_23 = None + getitem_24 = value_states_2[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_2 = None + hidden_states_24 = getitem_24.expand(1, 4, 8, 3, 64) + getitem_24 = None + value_4 = hidden_states_24.reshape(1, 32, 3, 64) + hidden_states_24 = None + attention_mask_3 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_2 = q_embed_2.contiguous() + q_embed_2 = None + key_5 = key_4.contiguous() + key_4 = None + value_5 = value_4.contiguous() + value_4 = None + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + query_2, + key_5, + value_5, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_2 = key_5 = value_5 = attention_mask_3 = None + transpose_12 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_12.contiguous() + transpose_12 = None + reshape_8 = attn_output_9.reshape(1, 3, -1) + attn_output_9 = None + attn_output_10 = reshape_8.contiguous() + reshape_8 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_10 = l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_25 = hidden_states_19 + attn_output_11 + hidden_states_19 = attn_output_11 = None + hidden_states_26 = hidden_states_25.to(torch.float32) + pow_6 = 
hidden_states_26.pow(2) + variance_5 = pow_6.mean(-1, keepdim=True) + pow_6 = None + add_16 = variance_5 + 1e-05 + variance_5 = None + rsqrt_5 = torch.rsqrt(add_16) + add_16 = None + hidden_states_27 = hidden_states_26 * rsqrt_5 + hidden_states_26 = rsqrt_5 = None + to_15 = hidden_states_27.to(torch.float32) + hidden_states_27 = None + hidden_states_28 = ( + l_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + * to_15 + ) + l_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = ( + to_15 + ) = None + linear_18 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_2 = torch.nn.functional.silu(linear_18, inplace=False) + linear_18 = None + linear_19 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_28 = l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_29 = silu_2 * linear_19 + silu_2 = linear_19 = None + down_proj_2 = torch._C._nn.linear( + mul_29, + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_29 = l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_29 = hidden_states_25 + down_proj_2 + hidden_states_25 = down_proj_2 = None + hidden_states_30 = hidden_states_29.to(torch.float32) + pow_7 = hidden_states_30.pow(2) + variance_6 = pow_7.mean(-1, keepdim=True) + pow_7 = None + add_18 = variance_6 + 1e-05 + variance_6 = None + rsqrt_6 = torch.rsqrt(add_18) + add_18 = None + hidden_states_31 = hidden_states_30 * rsqrt_6 + hidden_states_30 = rsqrt_6 = None + to_17 = hidden_states_31.to(torch.float32) + hidden_states_31 = None + hidden_states_32 = ( + l_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + * to_17 + ) + l_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + to_17 + ) = None + linear_21 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_10 = linear_21.view((1, 3, -1, 64)) + linear_21 = None + query_states_3 = view_10.transpose(1, 2) + view_10 = None + linear_22 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_11 = linear_22.view((1, 3, -1, 64)) + linear_22 = None + key_states_3 = view_11.transpose(1, 2) + view_11 = None + linear_23 = torch._C._nn.linear( + hidden_states_32, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_32 = l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_12 = linear_23.view((1, 3, -1, 64)) + linear_23 = None + value_states_3 = view_12.transpose(1, 2) + view_12 = None + cos_6 = 
cos_2.unsqueeze(1) + sin_6 = sin_2.unsqueeze(1) + mul_32 = query_states_3 * cos_6 + x1_6 = query_states_3[(Ellipsis, slice(None, 32, None))] + x2_6 = query_states_3[(Ellipsis, slice(32, None, None))] + query_states_3 = None + neg_6 = -x2_6 + x2_6 = None + cat_7 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_33 = cat_7 * sin_6 + cat_7 = None + q_embed_3 = mul_32 + mul_33 + mul_32 = mul_33 = None + mul_34 = key_states_3 * cos_6 + cos_6 = None + x1_7 = key_states_3[(Ellipsis, slice(None, 32, None))] + x2_7 = key_states_3[(Ellipsis, slice(32, None, None))] + key_states_3 = None + neg_7 = -x2_7 + x2_7 = None + cat_8 = torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_35 = cat_8 * sin_6 + cat_8 = sin_6 = None + k_embed_3 = mul_34 + mul_35 + mul_34 = mul_35 = None + getitem_30 = k_embed_3[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_3 = None + hidden_states_33 = getitem_30.expand(1, 4, 8, 3, 64) + getitem_30 = None + key_6 = hidden_states_33.reshape(1, 32, 3, 64) + hidden_states_33 = None + getitem_31 = value_states_3[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_3 = None + hidden_states_34 = getitem_31.expand(1, 4, 8, 3, 64) + getitem_31 = None + value_6 = hidden_states_34.reshape(1, 32, 3, 64) + hidden_states_34 = None + attention_mask_4 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_3 = q_embed_3.contiguous() + q_embed_3 = None + key_7 = key_6.contiguous() + key_6 = None + value_7 = value_6.contiguous() + value_6 = None + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_3, + key_7, + value_7, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_3 = key_7 = value_7 = attention_mask_4 = None + transpose_16 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_16.contiguous() + transpose_16 = None + reshape_11 = attn_output_13.reshape(1, 3, -1) + attn_output_13 = None + attn_output_14 = reshape_11.contiguous() + reshape_11 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_14 = l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_35 = hidden_states_29 + attn_output_15 + hidden_states_29 = attn_output_15 = None + hidden_states_36 = hidden_states_35.to(torch.float32) + pow_8 = hidden_states_36.pow(2) + variance_7 = pow_8.mean(-1, keepdim=True) + pow_8 = None + add_22 = variance_7 + 1e-05 + variance_7 = None + rsqrt_7 = torch.rsqrt(add_22) + add_22 = None + hidden_states_37 = hidden_states_36 * rsqrt_7 + hidden_states_36 = rsqrt_7 = None + to_19 = hidden_states_37.to(torch.float32) + hidden_states_37 = None + hidden_states_38 = ( + l_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + * to_19 + ) + l_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = ( + to_19 + ) = None + linear_25 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + 
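+        # NOTE (annotation): the linear / silu / linear / linear chain around this
+        # point is the SwiGLU MLP of one decoder layer. A minimal sketch of what
+        # the traced ops compute, with illustrative weight names w_gate, w_up,
+        # w_down (not identifiers from this graph):
+        #     h = torch.nn.functional.silu(x @ w_gate.T) * (x @ w_up.T)
+        #     y = h @ w_down.T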
l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_3 = torch.nn.functional.silu(linear_25, inplace=False) + linear_25 = None + linear_26 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_38 = l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_38 = silu_3 * linear_26 + silu_3 = linear_26 = None + down_proj_3 = torch._C._nn.linear( + mul_38, + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_38 = l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_39 = hidden_states_35 + down_proj_3 + hidden_states_35 = down_proj_3 = None + hidden_states_40 = hidden_states_39.to(torch.float32) + pow_9 = hidden_states_40.pow(2) + variance_8 = pow_9.mean(-1, keepdim=True) + pow_9 = None + add_24 = variance_8 + 1e-05 + variance_8 = None + rsqrt_8 = torch.rsqrt(add_24) + add_24 = None + hidden_states_41 = hidden_states_40 * rsqrt_8 + hidden_states_40 = rsqrt_8 = None + to_21 = hidden_states_41.to(torch.float32) + hidden_states_41 = None + hidden_states_42 = ( + l_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + * to_21 + ) + l_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + to_21 + ) = None + linear_28 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_13 = linear_28.view((1, 3, -1, 64)) + linear_28 = None + query_states_4 = view_13.transpose(1, 2) + view_13 = None + linear_29 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_14 = linear_29.view((1, 3, -1, 64)) + linear_29 = None + key_states_4 = view_14.transpose(1, 2) + view_14 = None + linear_30 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_42 = l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_15 = linear_30.view((1, 3, -1, 64)) + linear_30 = None + value_states_4 = view_15.transpose(1, 2) + view_15 = None + cos_7 = cos_2.unsqueeze(1) + sin_7 = sin_2.unsqueeze(1) + mul_41 = query_states_4 * cos_7 + x1_8 = query_states_4[(Ellipsis, slice(None, 32, None))] + x2_8 = query_states_4[(Ellipsis, slice(32, None, None))] + query_states_4 = None + neg_8 = -x2_8 + x2_8 = None + cat_9 = torch.cat((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + mul_42 = cat_9 * sin_7 + cat_9 = None + q_embed_4 = mul_41 + mul_42 + mul_41 = mul_42 = None + mul_43 = key_states_4 * cos_7 + cos_7 = None + x1_9 = key_states_4[(Ellipsis, slice(None, 32, None))] + x2_9 = key_states_4[(Ellipsis, slice(32, None, None))] + key_states_4 = None + neg_9 = -x2_9 + x2_9 = None + cat_10 = torch.cat((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + mul_44 = cat_10 * sin_7 + cat_10 = sin_7 = None + k_embed_4 = mul_43 + mul_44 + 
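+        # NOTE (annotation): the mul/cat/neg chain just above applies rotary
+        # position embeddings, q_embed = q * cos + rotate_half(q) * sin, and the
+        # expand(1, 4, 8, 3, 64) + reshape(1, 32, 3, 64) pattern that follows is
+        # repeat_kv for grouped-query attention: 4 KV heads are each tiled 8x so
+        # that scaled_dot_product_attention sees the same 32 heads as the queries
+        # (head_dim 64, hence scale=0.125 = 1/sqrt(64)).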
mul_43 = mul_44 = None + getitem_37 = k_embed_4[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_4 = None + hidden_states_43 = getitem_37.expand(1, 4, 8, 3, 64) + getitem_37 = None + key_8 = hidden_states_43.reshape(1, 32, 3, 64) + hidden_states_43 = None + getitem_38 = value_states_4[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_4 = None + hidden_states_44 = getitem_38.expand(1, 4, 8, 3, 64) + getitem_38 = None + value_8 = hidden_states_44.reshape(1, 32, 3, 64) + hidden_states_44 = None + attention_mask_5 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_4 = q_embed_4.contiguous() + q_embed_4 = None + key_9 = key_8.contiguous() + key_8 = None + value_9 = value_8.contiguous() + value_8 = None + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + query_4, + key_9, + value_9, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_4 = key_9 = value_9 = attention_mask_5 = None + transpose_20 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_20.contiguous() + transpose_20 = None + reshape_14 = attn_output_17.reshape(1, 3, -1) + attn_output_17 = None + attn_output_18 = reshape_14.contiguous() + reshape_14 = None + attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_18 = l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_45 = hidden_states_39 + attn_output_19 + hidden_states_39 = attn_output_19 = None + hidden_states_46 = hidden_states_45.to(torch.float32) + pow_10 = hidden_states_46.pow(2) + variance_9 = pow_10.mean(-1, keepdim=True) + pow_10 = None + add_28 = variance_9 + 1e-05 + variance_9 = None + rsqrt_9 = torch.rsqrt(add_28) + add_28 = None + hidden_states_47 = hidden_states_46 * rsqrt_9 + hidden_states_46 = rsqrt_9 = None + to_23 = hidden_states_47.to(torch.float32) + hidden_states_47 = None + hidden_states_48 = ( + l_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + * to_23 + ) + l_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = ( + to_23 + ) = None + linear_32 = torch._C._nn.linear( + hidden_states_48, + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_4 = torch.nn.functional.silu(linear_32, inplace=False) + linear_32 = None + linear_33 = torch._C._nn.linear( + hidden_states_48, + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_48 = l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_47 = silu_4 * linear_33 + silu_4 = linear_33 = None + down_proj_4 = torch._C._nn.linear( + mul_47, + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_47 = l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_49 = 
hidden_states_45 + down_proj_4 + hidden_states_45 = down_proj_4 = None + hidden_states_50 = hidden_states_49.to(torch.float32) + pow_11 = hidden_states_50.pow(2) + variance_10 = pow_11.mean(-1, keepdim=True) + pow_11 = None + add_30 = variance_10 + 1e-05 + variance_10 = None + rsqrt_10 = torch.rsqrt(add_30) + add_30 = None + hidden_states_51 = hidden_states_50 * rsqrt_10 + hidden_states_50 = rsqrt_10 = None + to_25 = hidden_states_51.to(torch.float32) + hidden_states_51 = None + hidden_states_52 = ( + l_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + * to_25 + ) + l_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + to_25 + ) = None + linear_35 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_16 = linear_35.view((1, 3, -1, 64)) + linear_35 = None + query_states_5 = view_16.transpose(1, 2) + view_16 = None + linear_36 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_17 = linear_36.view((1, 3, -1, 64)) + linear_36 = None + key_states_5 = view_17.transpose(1, 2) + view_17 = None + linear_37 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_52 = l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_18 = linear_37.view((1, 3, -1, 64)) + linear_37 = None + value_states_5 = view_18.transpose(1, 2) + view_18 = None + cos_8 = cos_2.unsqueeze(1) + sin_8 = sin_2.unsqueeze(1) + mul_50 = query_states_5 * cos_8 + x1_10 = query_states_5[(Ellipsis, slice(None, 32, None))] + x2_10 = query_states_5[(Ellipsis, slice(32, None, None))] + query_states_5 = None + neg_10 = -x2_10 + x2_10 = None + cat_11 = torch.cat((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + mul_51 = cat_11 * sin_8 + cat_11 = None + q_embed_5 = mul_50 + mul_51 + mul_50 = mul_51 = None + mul_52 = key_states_5 * cos_8 + cos_8 = None + x1_11 = key_states_5[(Ellipsis, slice(None, 32, None))] + x2_11 = key_states_5[(Ellipsis, slice(32, None, None))] + key_states_5 = None + neg_11 = -x2_11 + x2_11 = None + cat_12 = torch.cat((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + mul_53 = cat_12 * sin_8 + cat_12 = sin_8 = None + k_embed_5 = mul_52 + mul_53 + mul_52 = mul_53 = None + getitem_44 = k_embed_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_5 = None + hidden_states_53 = getitem_44.expand(1, 4, 8, 3, 64) + getitem_44 = None + key_10 = hidden_states_53.reshape(1, 32, 3, 64) + hidden_states_53 = None + getitem_45 = value_states_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_5 = None + hidden_states_54 = getitem_45.expand(1, 4, 8, 3, 64) + getitem_45 = None + value_10 = hidden_states_54.reshape(1, 32, 3, 64) + hidden_states_54 = None + attention_mask_6 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, 
None), + slice(None, 3, None), + ) + ] + query_5 = q_embed_5.contiguous() + q_embed_5 = None + key_11 = key_10.contiguous() + key_10 = None + value_11 = value_10.contiguous() + value_10 = None + attn_output_20 = torch._C._nn.scaled_dot_product_attention( + query_5, + key_11, + value_11, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_5 = key_11 = value_11 = attention_mask_6 = None + transpose_24 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_24.contiguous() + transpose_24 = None + reshape_17 = attn_output_21.reshape(1, 3, -1) + attn_output_21 = None + attn_output_22 = reshape_17.contiguous() + reshape_17 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_22 = l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_55 = hidden_states_49 + attn_output_23 + hidden_states_49 = attn_output_23 = None + hidden_states_56 = hidden_states_55.to(torch.float32) + pow_12 = hidden_states_56.pow(2) + variance_11 = pow_12.mean(-1, keepdim=True) + pow_12 = None + add_34 = variance_11 + 1e-05 + variance_11 = None + rsqrt_11 = torch.rsqrt(add_34) + add_34 = None + hidden_states_57 = hidden_states_56 * rsqrt_11 + hidden_states_56 = rsqrt_11 = None + to_27 = hidden_states_57.to(torch.float32) + hidden_states_57 = None + hidden_states_58 = ( + l_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + * to_27 + ) + l_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = ( + to_27 + ) = None + linear_39 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_5 = torch.nn.functional.silu(linear_39, inplace=False) + linear_39 = None + linear_40 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_58 = l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_56 = silu_5 * linear_40 + silu_5 = linear_40 = None + down_proj_5 = torch._C._nn.linear( + mul_56, + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_56 = l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_59 = hidden_states_55 + down_proj_5 + hidden_states_55 = down_proj_5 = None + hidden_states_60 = hidden_states_59.to(torch.float32) + pow_13 = hidden_states_60.pow(2) + variance_12 = pow_13.mean(-1, keepdim=True) + pow_13 = None + add_36 = variance_12 + 1e-05 + variance_12 = None + rsqrt_12 = torch.rsqrt(add_36) + add_36 = None + hidden_states_61 = hidden_states_60 * rsqrt_12 + hidden_states_60 = rsqrt_12 = None + to_29 = hidden_states_61.to(torch.float32) + hidden_states_61 = None + hidden_states_62 = ( + l_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + * to_29 + ) + l_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + to_29 + ) = None + linear_42 = torch._C._nn.linear( + hidden_states_62, + 
l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_19 = linear_42.view((1, 3, -1, 64)) + linear_42 = None + query_states_6 = view_19.transpose(1, 2) + view_19 = None + linear_43 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_20 = linear_43.view((1, 3, -1, 64)) + linear_43 = None + key_states_6 = view_20.transpose(1, 2) + view_20 = None + linear_44 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_62 = l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_21 = linear_44.view((1, 3, -1, 64)) + linear_44 = None + value_states_6 = view_21.transpose(1, 2) + view_21 = None + cos_9 = cos_2.unsqueeze(1) + sin_9 = sin_2.unsqueeze(1) + mul_59 = query_states_6 * cos_9 + x1_12 = query_states_6[(Ellipsis, slice(None, 32, None))] + x2_12 = query_states_6[(Ellipsis, slice(32, None, None))] + query_states_6 = None + neg_12 = -x2_12 + x2_12 = None + cat_13 = torch.cat((neg_12, x1_12), dim=-1) + neg_12 = x1_12 = None + mul_60 = cat_13 * sin_9 + cat_13 = None + q_embed_6 = mul_59 + mul_60 + mul_59 = mul_60 = None + mul_61 = key_states_6 * cos_9 + cos_9 = None + x1_13 = key_states_6[(Ellipsis, slice(None, 32, None))] + x2_13 = key_states_6[(Ellipsis, slice(32, None, None))] + key_states_6 = None + neg_13 = -x2_13 + x2_13 = None + cat_14 = torch.cat((neg_13, x1_13), dim=-1) + neg_13 = x1_13 = None + mul_62 = cat_14 * sin_9 + cat_14 = sin_9 = None + k_embed_6 = mul_61 + mul_62 + mul_61 = mul_62 = None + getitem_51 = k_embed_6[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_6 = None + hidden_states_63 = getitem_51.expand(1, 4, 8, 3, 64) + getitem_51 = None + key_12 = hidden_states_63.reshape(1, 32, 3, 64) + hidden_states_63 = None + getitem_52 = value_states_6[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_6 = None + hidden_states_64 = getitem_52.expand(1, 4, 8, 3, 64) + getitem_52 = None + value_12 = hidden_states_64.reshape(1, 32, 3, 64) + hidden_states_64 = None + attention_mask_7 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_6 = q_embed_6.contiguous() + q_embed_6 = None + key_13 = key_12.contiguous() + key_12 = None + value_13 = value_12.contiguous() + value_12 = None + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_6, + key_13, + value_13, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_6 = key_13 = value_13 = attention_mask_7 = None + transpose_28 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_28.contiguous() + transpose_28 = None + reshape_20 = attn_output_25.reshape(1, 3, -1) + attn_output_25 = None + attn_output_26 = reshape_20.contiguous() + reshape_20 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + 
l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_26 = l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_65 = hidden_states_59 + attn_output_27 + hidden_states_59 = attn_output_27 = None + hidden_states_66 = hidden_states_65.to(torch.float32) + pow_14 = hidden_states_66.pow(2) + variance_13 = pow_14.mean(-1, keepdim=True) + pow_14 = None + add_40 = variance_13 + 1e-05 + variance_13 = None + rsqrt_13 = torch.rsqrt(add_40) + add_40 = None + hidden_states_67 = hidden_states_66 * rsqrt_13 + hidden_states_66 = rsqrt_13 = None + to_31 = hidden_states_67.to(torch.float32) + hidden_states_67 = None + hidden_states_68 = ( + l_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + * to_31 + ) + l_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = ( + to_31 + ) = None + linear_46 = torch._C._nn.linear( + hidden_states_68, + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_6 = torch.nn.functional.silu(linear_46, inplace=False) + linear_46 = None + linear_47 = torch._C._nn.linear( + hidden_states_68, + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_68 = l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_65 = silu_6 * linear_47 + silu_6 = linear_47 = None + down_proj_6 = torch._C._nn.linear( + mul_65, + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_65 = l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_69 = hidden_states_65 + down_proj_6 + hidden_states_65 = down_proj_6 = None + hidden_states_70 = hidden_states_69.to(torch.float32) + pow_15 = hidden_states_70.pow(2) + variance_14 = pow_15.mean(-1, keepdim=True) + pow_15 = None + add_42 = variance_14 + 1e-05 + variance_14 = None + rsqrt_14 = torch.rsqrt(add_42) + add_42 = None + hidden_states_71 = hidden_states_70 * rsqrt_14 + hidden_states_70 = rsqrt_14 = None + to_33 = hidden_states_71.to(torch.float32) + hidden_states_71 = None + hidden_states_72 = ( + l_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + * to_33 + ) + l_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + to_33 + ) = None + linear_49 = torch._C._nn.linear( + hidden_states_72, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_22 = linear_49.view((1, 3, -1, 64)) + linear_49 = None + query_states_7 = view_22.transpose(1, 2) + view_22 = None + linear_50 = torch._C._nn.linear( + hidden_states_72, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_23 = linear_50.view((1, 3, -1, 64)) + linear_50 = None + key_states_7 = view_23.transpose(1, 2) + view_23 = None + linear_51 = 
torch._C._nn.linear( + hidden_states_72, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_72 = l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_24 = linear_51.view((1, 3, -1, 64)) + linear_51 = None + value_states_7 = view_24.transpose(1, 2) + view_24 = None + cos_10 = cos_2.unsqueeze(1) + sin_10 = sin_2.unsqueeze(1) + mul_68 = query_states_7 * cos_10 + x1_14 = query_states_7[(Ellipsis, slice(None, 32, None))] + x2_14 = query_states_7[(Ellipsis, slice(32, None, None))] + query_states_7 = None + neg_14 = -x2_14 + x2_14 = None + cat_15 = torch.cat((neg_14, x1_14), dim=-1) + neg_14 = x1_14 = None + mul_69 = cat_15 * sin_10 + cat_15 = None + q_embed_7 = mul_68 + mul_69 + mul_68 = mul_69 = None + mul_70 = key_states_7 * cos_10 + cos_10 = None + x1_15 = key_states_7[(Ellipsis, slice(None, 32, None))] + x2_15 = key_states_7[(Ellipsis, slice(32, None, None))] + key_states_7 = None + neg_15 = -x2_15 + x2_15 = None + cat_16 = torch.cat((neg_15, x1_15), dim=-1) + neg_15 = x1_15 = None + mul_71 = cat_16 * sin_10 + cat_16 = sin_10 = None + k_embed_7 = mul_70 + mul_71 + mul_70 = mul_71 = None + getitem_58 = k_embed_7[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_7 = None + hidden_states_73 = getitem_58.expand(1, 4, 8, 3, 64) + getitem_58 = None + key_14 = hidden_states_73.reshape(1, 32, 3, 64) + hidden_states_73 = None + getitem_59 = value_states_7[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_7 = None + hidden_states_74 = getitem_59.expand(1, 4, 8, 3, 64) + getitem_59 = None + value_14 = hidden_states_74.reshape(1, 32, 3, 64) + hidden_states_74 = None + attention_mask_8 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_7 = q_embed_7.contiguous() + q_embed_7 = None + key_15 = key_14.contiguous() + key_14 = None + value_15 = value_14.contiguous() + value_14 = None + attn_output_28 = torch._C._nn.scaled_dot_product_attention( + query_7, + key_15, + value_15, + attn_mask=attention_mask_8, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_7 = key_15 = value_15 = attention_mask_8 = None + transpose_32 = attn_output_28.transpose(1, 2) + attn_output_28 = None + attn_output_29 = transpose_32.contiguous() + transpose_32 = None + reshape_23 = attn_output_29.reshape(1, 3, -1) + attn_output_29 = None + attn_output_30 = reshape_23.contiguous() + reshape_23 = None + attn_output_31 = torch._C._nn.linear( + attn_output_30, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_30 = l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_75 = hidden_states_69 + attn_output_31 + hidden_states_69 = attn_output_31 = None + hidden_states_76 = hidden_states_75.to(torch.float32) + pow_16 = hidden_states_76.pow(2) + variance_15 = pow_16.mean(-1, keepdim=True) + pow_16 = None + add_46 = variance_15 + 1e-05 + variance_15 = None + rsqrt_15 = torch.rsqrt(add_46) + add_46 = None + hidden_states_77 = hidden_states_76 * rsqrt_15 + hidden_states_76 = rsqrt_15 = None + to_35 = hidden_states_77.to(torch.float32) + hidden_states_77 = None + hidden_states_78 = ( + 
l_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + * to_35 + ) + l_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = ( + to_35 + ) = None + linear_53 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_7 = torch.nn.functional.silu(linear_53, inplace=False) + linear_53 = None + linear_54 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_78 = l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_74 = silu_7 * linear_54 + silu_7 = linear_54 = None + down_proj_7 = torch._C._nn.linear( + mul_74, + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_74 = l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_79 = hidden_states_75 + down_proj_7 + hidden_states_75 = down_proj_7 = None + hidden_states_80 = hidden_states_79.to(torch.float32) + pow_17 = hidden_states_80.pow(2) + variance_16 = pow_17.mean(-1, keepdim=True) + pow_17 = None + add_48 = variance_16 + 1e-05 + variance_16 = None + rsqrt_16 = torch.rsqrt(add_48) + add_48 = None + hidden_states_81 = hidden_states_80 * rsqrt_16 + hidden_states_80 = rsqrt_16 = None + to_37 = hidden_states_81.to(torch.float32) + hidden_states_81 = None + hidden_states_82 = ( + l_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + * to_37 + ) + l_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + to_37 + ) = None + linear_56 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_25 = linear_56.view((1, 3, -1, 64)) + linear_56 = None + query_states_8 = view_25.transpose(1, 2) + view_25 = None + linear_57 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_26 = linear_57.view((1, 3, -1, 64)) + linear_57 = None + key_states_8 = view_26.transpose(1, 2) + view_26 = None + linear_58 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_82 = l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_27 = linear_58.view((1, 3, -1, 64)) + linear_58 = None + value_states_8 = view_27.transpose(1, 2) + view_27 = None + cos_11 = cos_2.unsqueeze(1) + sin_11 = sin_2.unsqueeze(1) + mul_77 = query_states_8 * cos_11 + x1_16 = query_states_8[(Ellipsis, slice(None, 32, None))] + x2_16 = query_states_8[(Ellipsis, slice(32, None, None))] + query_states_8 = None + neg_16 = -x2_16 + x2_16 = None + cat_17 = torch.cat((neg_16, x1_16), dim=-1) + neg_16 = x1_16 = None + mul_78 = cat_17 * sin_11 + 
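# The cos/sin multiply with the slice-at-32 / negate / concat pattern traced
# above (x1_16/x2_16, neg_16, cat_17/cat_18, ...) is the standard
# "rotate_half" rotary position embedding, unrolled once per layer by the
# tracer. A minimal sketch of the equivalent computation, assuming
# head_dim=64 as implied by the view((1, 3, -1, 64)) calls; the function
# names are illustrative, not part of the captured graph:
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    half = x.shape[-1] // 2          # the trace hard-codes 32 (= 64 // 2)
    x1, x2 = x[..., :half], x[..., half:]
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin):
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)   # matches cos_11 / sin_11
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin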
cat_17 = None + q_embed_8 = mul_77 + mul_78 + mul_77 = mul_78 = None + mul_79 = key_states_8 * cos_11 + cos_11 = None + x1_17 = key_states_8[(Ellipsis, slice(None, 32, None))] + x2_17 = key_states_8[(Ellipsis, slice(32, None, None))] + key_states_8 = None + neg_17 = -x2_17 + x2_17 = None + cat_18 = torch.cat((neg_17, x1_17), dim=-1) + neg_17 = x1_17 = None + mul_80 = cat_18 * sin_11 + cat_18 = sin_11 = None + k_embed_8 = mul_79 + mul_80 + mul_79 = mul_80 = None + getitem_65 = k_embed_8[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_8 = None + hidden_states_83 = getitem_65.expand(1, 4, 8, 3, 64) + getitem_65 = None + key_16 = hidden_states_83.reshape(1, 32, 3, 64) + hidden_states_83 = None + getitem_66 = value_states_8[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_8 = None + hidden_states_84 = getitem_66.expand(1, 4, 8, 3, 64) + getitem_66 = None + value_16 = hidden_states_84.reshape(1, 32, 3, 64) + hidden_states_84 = None + attention_mask_9 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_8 = q_embed_8.contiguous() + q_embed_8 = None + key_17 = key_16.contiguous() + key_16 = None + value_17 = value_16.contiguous() + value_16 = None + attn_output_32 = torch._C._nn.scaled_dot_product_attention( + query_8, + key_17, + value_17, + attn_mask=attention_mask_9, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_8 = key_17 = value_17 = attention_mask_9 = None + transpose_36 = attn_output_32.transpose(1, 2) + attn_output_32 = None + attn_output_33 = transpose_36.contiguous() + transpose_36 = None + reshape_26 = attn_output_33.reshape(1, 3, -1) + attn_output_33 = None + attn_output_34 = reshape_26.contiguous() + reshape_26 = None + attn_output_35 = torch._C._nn.linear( + attn_output_34, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_34 = l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_85 = hidden_states_79 + attn_output_35 + hidden_states_79 = attn_output_35 = None + hidden_states_86 = hidden_states_85.to(torch.float32) + pow_18 = hidden_states_86.pow(2) + variance_17 = pow_18.mean(-1, keepdim=True) + pow_18 = None + add_52 = variance_17 + 1e-05 + variance_17 = None + rsqrt_17 = torch.rsqrt(add_52) + add_52 = None + hidden_states_87 = hidden_states_86 * rsqrt_17 + hidden_states_86 = rsqrt_17 = None + to_39 = hidden_states_87.to(torch.float32) + hidden_states_87 = None + hidden_states_88 = ( + l_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + * to_39 + ) + l_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = ( + to_39 + ) = None + linear_60 = torch._C._nn.linear( + hidden_states_88, + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_8 = torch.nn.functional.silu(linear_60, inplace=False) + linear_60 = None + linear_61 = torch._C._nn.linear( + hidden_states_88, + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_88 = 
l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_83 = silu_8 * linear_61 + silu_8 = linear_61 = None + down_proj_8 = torch._C._nn.linear( + mul_83, + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_83 = l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_89 = hidden_states_85 + down_proj_8 + hidden_states_85 = down_proj_8 = None + hidden_states_90 = hidden_states_89.to(torch.float32) + pow_19 = hidden_states_90.pow(2) + variance_18 = pow_19.mean(-1, keepdim=True) + pow_19 = None + add_54 = variance_18 + 1e-05 + variance_18 = None + rsqrt_18 = torch.rsqrt(add_54) + add_54 = None + hidden_states_91 = hidden_states_90 * rsqrt_18 + hidden_states_90 = rsqrt_18 = None + to_41 = hidden_states_91.to(torch.float32) + hidden_states_91 = None + hidden_states_92 = ( + l_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + * to_41 + ) + l_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + to_41 + ) = None + linear_63 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_28 = linear_63.view((1, 3, -1, 64)) + linear_63 = None + query_states_9 = view_28.transpose(1, 2) + view_28 = None + linear_64 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_29 = linear_64.view((1, 3, -1, 64)) + linear_64 = None + key_states_9 = view_29.transpose(1, 2) + view_29 = None + linear_65 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_92 = l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_30 = linear_65.view((1, 3, -1, 64)) + linear_65 = None + value_states_9 = view_30.transpose(1, 2) + view_30 = None + cos_12 = cos_2.unsqueeze(1) + sin_12 = sin_2.unsqueeze(1) + mul_86 = query_states_9 * cos_12 + x1_18 = query_states_9[(Ellipsis, slice(None, 32, None))] + x2_18 = query_states_9[(Ellipsis, slice(32, None, None))] + query_states_9 = None + neg_18 = -x2_18 + x2_18 = None + cat_19 = torch.cat((neg_18, x1_18), dim=-1) + neg_18 = x1_18 = None + mul_87 = cat_19 * sin_12 + cat_19 = None + q_embed_9 = mul_86 + mul_87 + mul_86 = mul_87 = None + mul_88 = key_states_9 * cos_12 + cos_12 = None + x1_19 = key_states_9[(Ellipsis, slice(None, 32, None))] + x2_19 = key_states_9[(Ellipsis, slice(32, None, None))] + key_states_9 = None + neg_19 = -x2_19 + x2_19 = None + cat_20 = torch.cat((neg_19, x1_19), dim=-1) + neg_19 = x1_19 = None + mul_89 = cat_20 * sin_12 + cat_20 = sin_12 = None + k_embed_9 = mul_88 + mul_89 + mul_88 = mul_89 = None + getitem_72 = k_embed_9[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_9 = None + hidden_states_93 = getitem_72.expand(1, 4, 8, 3, 64) + getitem_72 = None + key_18 = hidden_states_93.reshape(1, 32, 3, 64) + 
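# The subscript-with-None / expand(1, 4, 8, 3, 64) / reshape(1, 32, 3, 64)
# pairs above (e.g. getitem_72 -> hidden_states_93 -> key_18) are
# grouped-query attention's KV repeat: 4 key/value heads are broadcast 8x to
# line up with the 32 query heads. A sketch with shapes taken from the trace:
import torch

def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor:
    b, n_kv, s, d = kv.shape
    kv = kv[:, :, None, :, :].expand(b, n_kv, n_rep, s, d)  # broadcast, no copy yet
    return kv.reshape(b, n_kv * n_rep, s, d)                # materialize repeated heads

# With this graph's shapes:
# repeat_kv(torch.randn(1, 4, 3, 64), 8).shape == (1, 32, 3, 64)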
hidden_states_93 = None + getitem_73 = value_states_9[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_9 = None + hidden_states_94 = getitem_73.expand(1, 4, 8, 3, 64) + getitem_73 = None + value_18 = hidden_states_94.reshape(1, 32, 3, 64) + hidden_states_94 = None + attention_mask_10 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_9 = q_embed_9.contiguous() + q_embed_9 = None + key_19 = key_18.contiguous() + key_18 = None + value_19 = value_18.contiguous() + value_18 = None + attn_output_36 = torch._C._nn.scaled_dot_product_attention( + query_9, + key_19, + value_19, + attn_mask=attention_mask_10, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_9 = key_19 = value_19 = attention_mask_10 = None + transpose_40 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_37 = transpose_40.contiguous() + transpose_40 = None + reshape_29 = attn_output_37.reshape(1, 3, -1) + attn_output_37 = None + attn_output_38 = reshape_29.contiguous() + reshape_29 = None + attn_output_39 = torch._C._nn.linear( + attn_output_38, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_38 = l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_95 = hidden_states_89 + attn_output_39 + hidden_states_89 = attn_output_39 = None + hidden_states_96 = hidden_states_95.to(torch.float32) + pow_20 = hidden_states_96.pow(2) + variance_19 = pow_20.mean(-1, keepdim=True) + pow_20 = None + add_58 = variance_19 + 1e-05 + variance_19 = None + rsqrt_19 = torch.rsqrt(add_58) + add_58 = None + hidden_states_97 = hidden_states_96 * rsqrt_19 + hidden_states_96 = rsqrt_19 = None + to_43 = hidden_states_97.to(torch.float32) + hidden_states_97 = None + hidden_states_98 = ( + l_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + * to_43 + ) + l_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = ( + to_43 + ) = None + linear_67 = torch._C._nn.linear( + hidden_states_98, + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_9 = torch.nn.functional.silu(linear_67, inplace=False) + linear_67 = None + linear_68 = torch._C._nn.linear( + hidden_states_98, + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_98 = l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_92 = silu_9 * linear_68 + silu_9 = linear_68 = None + down_proj_9 = torch._C._nn.linear( + mul_92, + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_92 = l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_99 = hidden_states_95 + down_proj_9 + hidden_states_95 = down_proj_9 = None + hidden_states_100 = hidden_states_99.to(torch.float32) + pow_21 = hidden_states_100.pow(2) + variance_20 = pow_21.mean(-1, keepdim=True) + pow_21 = None + add_60 = variance_20 + 1e-05 + variance_20 = None + rsqrt_20 = torch.rsqrt(add_60) + 
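# The to(float32) / pow(2) / mean(-1, keepdim=True) / rsqrt / weight-multiply
# runs above (pow_21, variance_20, rsqrt_20, ...) are RMSNorm inlined by the
# tracer, with eps=1e-05 read straight from the graph. A sketch of one
# application:
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    x = x.to(torch.float32)                      # upcast, as in the trace
    variance = x.pow(2).mean(-1, keepdim=True)
    return weight * (x * torch.rsqrt(variance + eps))

# The pervasive `a = b = None` assignments throughout the graph drop each
# intermediate (and each captured weight) right after its last use; they are
# memory bookkeeping emitted by the tracer, not part of the math.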
add_60 = None + hidden_states_101 = hidden_states_100 * rsqrt_20 + hidden_states_100 = rsqrt_20 = None + to_45 = hidden_states_101.to(torch.float32) + hidden_states_101 = None + hidden_states_102 = ( + l_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + * to_45 + ) + l_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + to_45 + ) = None + linear_70 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_31 = linear_70.view((1, 3, -1, 64)) + linear_70 = None + query_states_10 = view_31.transpose(1, 2) + view_31 = None + linear_71 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_32 = linear_71.view((1, 3, -1, 64)) + linear_71 = None + key_states_10 = view_32.transpose(1, 2) + view_32 = None + linear_72 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_102 = l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_33 = linear_72.view((1, 3, -1, 64)) + linear_72 = None + value_states_10 = view_33.transpose(1, 2) + view_33 = None + cos_13 = cos_2.unsqueeze(1) + sin_13 = sin_2.unsqueeze(1) + mul_95 = query_states_10 * cos_13 + x1_20 = query_states_10[(Ellipsis, slice(None, 32, None))] + x2_20 = query_states_10[(Ellipsis, slice(32, None, None))] + query_states_10 = None + neg_20 = -x2_20 + x2_20 = None + cat_21 = torch.cat((neg_20, x1_20), dim=-1) + neg_20 = x1_20 = None + mul_96 = cat_21 * sin_13 + cat_21 = None + q_embed_10 = mul_95 + mul_96 + mul_95 = mul_96 = None + mul_97 = key_states_10 * cos_13 + cos_13 = None + x1_21 = key_states_10[(Ellipsis, slice(None, 32, None))] + x2_21 = key_states_10[(Ellipsis, slice(32, None, None))] + key_states_10 = None + neg_21 = -x2_21 + x2_21 = None + cat_22 = torch.cat((neg_21, x1_21), dim=-1) + neg_21 = x1_21 = None + mul_98 = cat_22 * sin_13 + cat_22 = sin_13 = None + k_embed_10 = mul_97 + mul_98 + mul_97 = mul_98 = None + getitem_79 = k_embed_10[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_10 = None + hidden_states_103 = getitem_79.expand(1, 4, 8, 3, 64) + getitem_79 = None + key_20 = hidden_states_103.reshape(1, 32, 3, 64) + hidden_states_103 = None + getitem_80 = value_states_10[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_10 = None + hidden_states_104 = getitem_80.expand(1, 4, 8, 3, 64) + getitem_80 = None + value_20 = hidden_states_104.reshape(1, 32, 3, 64) + hidden_states_104 = None + attention_mask_11 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_10 = q_embed_10.contiguous() + q_embed_10 = None + key_21 = key_20.contiguous() + key_20 = None + value_21 = value_20.contiguous() + value_20 = None + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + 
query_10, + key_21, + value_21, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_10 = key_21 = value_21 = attention_mask_11 = None + transpose_44 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_44.contiguous() + transpose_44 = None + reshape_32 = attn_output_41.reshape(1, 3, -1) + attn_output_41 = None + attn_output_42 = reshape_32.contiguous() + reshape_32 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_42 = l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_105 = hidden_states_99 + attn_output_43 + hidden_states_99 = attn_output_43 = None + hidden_states_106 = hidden_states_105.to(torch.float32) + pow_22 = hidden_states_106.pow(2) + variance_21 = pow_22.mean(-1, keepdim=True) + pow_22 = None + add_64 = variance_21 + 1e-05 + variance_21 = None + rsqrt_21 = torch.rsqrt(add_64) + add_64 = None + hidden_states_107 = hidden_states_106 * rsqrt_21 + hidden_states_106 = rsqrt_21 = None + to_47 = hidden_states_107.to(torch.float32) + hidden_states_107 = None + hidden_states_108 = ( + l_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + * to_47 + ) + l_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = ( + to_47 + ) = None + linear_74 = torch._C._nn.linear( + hidden_states_108, + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_10 = torch.nn.functional.silu(linear_74, inplace=False) + linear_74 = None + linear_75 = torch._C._nn.linear( + hidden_states_108, + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_108 = l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_101 = silu_10 * linear_75 + silu_10 = linear_75 = None + down_proj_10 = torch._C._nn.linear( + mul_101, + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_101 = l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_109 = hidden_states_105 + down_proj_10 + hidden_states_105 = down_proj_10 = None + hidden_states_110 = hidden_states_109.to(torch.float32) + pow_23 = hidden_states_110.pow(2) + variance_22 = pow_23.mean(-1, keepdim=True) + pow_23 = None + add_66 = variance_22 + 1e-05 + variance_22 = None + rsqrt_22 = torch.rsqrt(add_66) + add_66 = None + hidden_states_111 = hidden_states_110 * rsqrt_22 + hidden_states_110 = rsqrt_22 = None + to_49 = hidden_states_111.to(torch.float32) + hidden_states_111 = None + hidden_states_112 = ( + l_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + * to_49 + ) + l_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + to_49 + ) = None + linear_77 = torch._C._nn.linear( + hidden_states_112, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = 
( + None + ) + view_34 = linear_77.view((1, 3, -1, 64)) + linear_77 = None + query_states_11 = view_34.transpose(1, 2) + view_34 = None + linear_78 = torch._C._nn.linear( + hidden_states_112, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_35 = linear_78.view((1, 3, -1, 64)) + linear_78 = None + key_states_11 = view_35.transpose(1, 2) + view_35 = None + linear_79 = torch._C._nn.linear( + hidden_states_112, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_112 = l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_36 = linear_79.view((1, 3, -1, 64)) + linear_79 = None + value_states_11 = view_36.transpose(1, 2) + view_36 = None + cos_14 = cos_2.unsqueeze(1) + sin_14 = sin_2.unsqueeze(1) + mul_104 = query_states_11 * cos_14 + x1_22 = query_states_11[(Ellipsis, slice(None, 32, None))] + x2_22 = query_states_11[(Ellipsis, slice(32, None, None))] + query_states_11 = None + neg_22 = -x2_22 + x2_22 = None + cat_23 = torch.cat((neg_22, x1_22), dim=-1) + neg_22 = x1_22 = None + mul_105 = cat_23 * sin_14 + cat_23 = None + q_embed_11 = mul_104 + mul_105 + mul_104 = mul_105 = None + mul_106 = key_states_11 * cos_14 + cos_14 = None + x1_23 = key_states_11[(Ellipsis, slice(None, 32, None))] + x2_23 = key_states_11[(Ellipsis, slice(32, None, None))] + key_states_11 = None + neg_23 = -x2_23 + x2_23 = None + cat_24 = torch.cat((neg_23, x1_23), dim=-1) + neg_23 = x1_23 = None + mul_107 = cat_24 * sin_14 + cat_24 = sin_14 = None + k_embed_11 = mul_106 + mul_107 + mul_106 = mul_107 = None + getitem_86 = k_embed_11[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_11 = None + hidden_states_113 = getitem_86.expand(1, 4, 8, 3, 64) + getitem_86 = None + key_22 = hidden_states_113.reshape(1, 32, 3, 64) + hidden_states_113 = None + getitem_87 = value_states_11[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_11 = None + hidden_states_114 = getitem_87.expand(1, 4, 8, 3, 64) + getitem_87 = None + value_22 = hidden_states_114.reshape(1, 32, 3, 64) + hidden_states_114 = None + attention_mask_12 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_11 = q_embed_11.contiguous() + q_embed_11 = None + key_23 = key_22.contiguous() + key_22 = None + value_23 = value_22.contiguous() + value_22 = None + attn_output_44 = torch._C._nn.scaled_dot_product_attention( + query_11, + key_23, + value_23, + attn_mask=attention_mask_12, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_11 = key_23 = value_23 = attention_mask_12 = None + transpose_48 = attn_output_44.transpose(1, 2) + attn_output_44 = None + attn_output_45 = transpose_48.contiguous() + transpose_48 = None + reshape_35 = attn_output_45.reshape(1, 3, -1) + attn_output_45 = None + attn_output_46 = reshape_35.contiguous() + reshape_35 = None + attn_output_47 = torch._C._nn.linear( + attn_output_46, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_46 = 
l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_115 = hidden_states_109 + attn_output_47 + hidden_states_109 = attn_output_47 = None + hidden_states_116 = hidden_states_115.to(torch.float32) + pow_24 = hidden_states_116.pow(2) + variance_23 = pow_24.mean(-1, keepdim=True) + pow_24 = None + add_70 = variance_23 + 1e-05 + variance_23 = None + rsqrt_23 = torch.rsqrt(add_70) + add_70 = None + hidden_states_117 = hidden_states_116 * rsqrt_23 + hidden_states_116 = rsqrt_23 = None + to_51 = hidden_states_117.to(torch.float32) + hidden_states_117 = None + hidden_states_118 = ( + l_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + * to_51 + ) + l_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = ( + to_51 + ) = None + linear_81 = torch._C._nn.linear( + hidden_states_118, + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_11 = torch.nn.functional.silu(linear_81, inplace=False) + linear_81 = None + linear_82 = torch._C._nn.linear( + hidden_states_118, + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_118 = l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_110 = silu_11 * linear_82 + silu_11 = linear_82 = None + down_proj_11 = torch._C._nn.linear( + mul_110, + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_110 = l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_119 = hidden_states_115 + down_proj_11 + hidden_states_115 = down_proj_11 = None + hidden_states_120 = hidden_states_119.to(torch.float32) + pow_25 = hidden_states_120.pow(2) + variance_24 = pow_25.mean(-1, keepdim=True) + pow_25 = None + add_72 = variance_24 + 1e-05 + variance_24 = None + rsqrt_24 = torch.rsqrt(add_72) + add_72 = None + hidden_states_121 = hidden_states_120 * rsqrt_24 + hidden_states_120 = rsqrt_24 = None + to_53 = hidden_states_121.to(torch.float32) + hidden_states_121 = None + hidden_states_122 = ( + l_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + * to_53 + ) + l_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + to_53 + ) = None + linear_84 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_37 = linear_84.view((1, 3, -1, 64)) + linear_84 = None + query_states_12 = view_37.transpose(1, 2) + view_37 = None + linear_85 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_38 = linear_85.view((1, 3, -1, 64)) + linear_85 = None + key_states_12 = view_38.transpose(1, 2) + view_38 = None + linear_86 = torch._C._nn.linear( + hidden_states_122, + 
l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_122 = l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_39 = linear_86.view((1, 3, -1, 64)) + linear_86 = None + value_states_12 = view_39.transpose(1, 2) + view_39 = None + cos_15 = cos_2.unsqueeze(1) + sin_15 = sin_2.unsqueeze(1) + mul_113 = query_states_12 * cos_15 + x1_24 = query_states_12[(Ellipsis, slice(None, 32, None))] + x2_24 = query_states_12[(Ellipsis, slice(32, None, None))] + query_states_12 = None + neg_24 = -x2_24 + x2_24 = None + cat_25 = torch.cat((neg_24, x1_24), dim=-1) + neg_24 = x1_24 = None + mul_114 = cat_25 * sin_15 + cat_25 = None + q_embed_12 = mul_113 + mul_114 + mul_113 = mul_114 = None + mul_115 = key_states_12 * cos_15 + cos_15 = None + x1_25 = key_states_12[(Ellipsis, slice(None, 32, None))] + x2_25 = key_states_12[(Ellipsis, slice(32, None, None))] + key_states_12 = None + neg_25 = -x2_25 + x2_25 = None + cat_26 = torch.cat((neg_25, x1_25), dim=-1) + neg_25 = x1_25 = None + mul_116 = cat_26 * sin_15 + cat_26 = sin_15 = None + k_embed_12 = mul_115 + mul_116 + mul_115 = mul_116 = None + getitem_93 = k_embed_12[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_12 = None + hidden_states_123 = getitem_93.expand(1, 4, 8, 3, 64) + getitem_93 = None + key_24 = hidden_states_123.reshape(1, 32, 3, 64) + hidden_states_123 = None + getitem_94 = value_states_12[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_12 = None + hidden_states_124 = getitem_94.expand(1, 4, 8, 3, 64) + getitem_94 = None + value_24 = hidden_states_124.reshape(1, 32, 3, 64) + hidden_states_124 = None + attention_mask_13 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_12 = q_embed_12.contiguous() + q_embed_12 = None + key_25 = key_24.contiguous() + key_24 = None + value_25 = value_24.contiguous() + value_24 = None + attn_output_48 = torch._C._nn.scaled_dot_product_attention( + query_12, + key_25, + value_25, + attn_mask=attention_mask_13, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_12 = key_25 = value_25 = attention_mask_13 = None + transpose_52 = attn_output_48.transpose(1, 2) + attn_output_48 = None + attn_output_49 = transpose_52.contiguous() + transpose_52 = None + reshape_38 = attn_output_49.reshape(1, 3, -1) + attn_output_49 = None + attn_output_50 = reshape_38.contiguous() + reshape_38 = None + attn_output_51 = torch._C._nn.linear( + attn_output_50, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_50 = l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_125 = hidden_states_119 + attn_output_51 + hidden_states_119 = attn_output_51 = None + hidden_states_126 = hidden_states_125.to(torch.float32) + pow_26 = hidden_states_126.pow(2) + variance_25 = pow_26.mean(-1, keepdim=True) + pow_26 = None + add_76 = variance_25 + 1e-05 + variance_25 = None + rsqrt_25 = torch.rsqrt(add_76) + add_76 = None + hidden_states_127 = hidden_states_126 * rsqrt_25 + hidden_states_126 = rsqrt_25 = None + to_55 = hidden_states_127.to(torch.float32) + hidden_states_127 = None + 
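# Every attention block lands in the same scaled_dot_product_attention call
# seen above: an explicit additive mask (causal_mask_2 sliced to the key
# length of 3 captured in this graph), dropout_p=0.0, scale=0.125 ==
# 1/sqrt(64), and is_causal=False because causality already lives in the
# mask. A sketch of the equivalent call through the public API (PyTorch >=
# 2.1 for the scale kwarg); the zero mask below merely stands in for
# causal_mask_2:
import math
import torch
import torch.nn.functional as F

q, k, v = (torch.randn(1, 32, 3, 64) for _ in range(3))  # (batch, heads, seq, head_dim)
mask = torch.zeros(1, 1, 3, 3)  # stand-in for causal_mask_2[:, :, :, :3]
out = F.scaled_dot_product_attention(
    q, k, v, attn_mask=mask, dropout_p=0.0, scale=1 / math.sqrt(64), is_causal=False
)
assert out.shape == (1, 32, 3, 64)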
hidden_states_128 = ( + l_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + * to_55 + ) + l_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = ( + to_55 + ) = None + linear_88 = torch._C._nn.linear( + hidden_states_128, + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_12 = torch.nn.functional.silu(linear_88, inplace=False) + linear_88 = None + linear_89 = torch._C._nn.linear( + hidden_states_128, + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_128 = l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_119 = silu_12 * linear_89 + silu_12 = linear_89 = None + down_proj_12 = torch._C._nn.linear( + mul_119, + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_119 = l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_129 = hidden_states_125 + down_proj_12 + hidden_states_125 = down_proj_12 = None + hidden_states_130 = hidden_states_129.to(torch.float32) + pow_27 = hidden_states_130.pow(2) + variance_26 = pow_27.mean(-1, keepdim=True) + pow_27 = None + add_78 = variance_26 + 1e-05 + variance_26 = None + rsqrt_26 = torch.rsqrt(add_78) + add_78 = None + hidden_states_131 = hidden_states_130 * rsqrt_26 + hidden_states_130 = rsqrt_26 = None + to_57 = hidden_states_131.to(torch.float32) + hidden_states_131 = None + hidden_states_132 = ( + l_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + * to_57 + ) + l_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + to_57 + ) = None + linear_91 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_40 = linear_91.view((1, 3, -1, 64)) + linear_91 = None + query_states_13 = view_40.transpose(1, 2) + view_40 = None + linear_92 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_41 = linear_92.view((1, 3, -1, 64)) + linear_92 = None + key_states_13 = view_41.transpose(1, 2) + view_41 = None + linear_93 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_132 = l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_42 = linear_93.view((1, 3, -1, 64)) + linear_93 = None + value_states_13 = view_42.transpose(1, 2) + view_42 = None + cos_16 = cos_2.unsqueeze(1) + sin_16 = sin_2.unsqueeze(1) + mul_122 = query_states_13 * cos_16 + x1_26 = query_states_13[(Ellipsis, slice(None, 32, None))] + x2_26 = query_states_13[(Ellipsis, slice(32, None, None))] + query_states_13 = None + neg_26 = -x2_26 + x2_26 = None + cat_27 = 
torch.cat((neg_26, x1_26), dim=-1) + neg_26 = x1_26 = None + mul_123 = cat_27 * sin_16 + cat_27 = None + q_embed_13 = mul_122 + mul_123 + mul_122 = mul_123 = None + mul_124 = key_states_13 * cos_16 + cos_16 = None + x1_27 = key_states_13[(Ellipsis, slice(None, 32, None))] + x2_27 = key_states_13[(Ellipsis, slice(32, None, None))] + key_states_13 = None + neg_27 = -x2_27 + x2_27 = None + cat_28 = torch.cat((neg_27, x1_27), dim=-1) + neg_27 = x1_27 = None + mul_125 = cat_28 * sin_16 + cat_28 = sin_16 = None + k_embed_13 = mul_124 + mul_125 + mul_124 = mul_125 = None + getitem_100 = k_embed_13[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_13 = None + hidden_states_133 = getitem_100.expand(1, 4, 8, 3, 64) + getitem_100 = None + key_26 = hidden_states_133.reshape(1, 32, 3, 64) + hidden_states_133 = None + getitem_101 = value_states_13[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_13 = None + hidden_states_134 = getitem_101.expand(1, 4, 8, 3, 64) + getitem_101 = None + value_26 = hidden_states_134.reshape(1, 32, 3, 64) + hidden_states_134 = None + attention_mask_14 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_13 = q_embed_13.contiguous() + q_embed_13 = None + key_27 = key_26.contiguous() + key_26 = None + value_27 = value_26.contiguous() + value_26 = None + attn_output_52 = torch._C._nn.scaled_dot_product_attention( + query_13, + key_27, + value_27, + attn_mask=attention_mask_14, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_13 = key_27 = value_27 = attention_mask_14 = None + transpose_56 = attn_output_52.transpose(1, 2) + attn_output_52 = None + attn_output_53 = transpose_56.contiguous() + transpose_56 = None + reshape_41 = attn_output_53.reshape(1, 3, -1) + attn_output_53 = None + attn_output_54 = reshape_41.contiguous() + reshape_41 = None + attn_output_55 = torch._C._nn.linear( + attn_output_54, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_54 = l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_135 = hidden_states_129 + attn_output_55 + hidden_states_129 = attn_output_55 = None + hidden_states_136 = hidden_states_135.to(torch.float32) + pow_28 = hidden_states_136.pow(2) + variance_27 = pow_28.mean(-1, keepdim=True) + pow_28 = None + add_82 = variance_27 + 1e-05 + variance_27 = None + rsqrt_27 = torch.rsqrt(add_82) + add_82 = None + hidden_states_137 = hidden_states_136 * rsqrt_27 + hidden_states_136 = rsqrt_27 = None + to_59 = hidden_states_137.to(torch.float32) + hidden_states_137 = None + hidden_states_138 = ( + l_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + * to_59 + ) + l_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = ( + to_59 + ) = None + linear_95 = torch._C._nn.linear( + hidden_states_138, + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_13 = torch.nn.functional.silu(linear_95, inplace=False) + linear_95 = None + linear_96 = torch._C._nn.linear( + 
hidden_states_138, + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_138 = l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_128 = silu_13 * linear_96 + silu_13 = linear_96 = None + down_proj_13 = torch._C._nn.linear( + mul_128, + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_128 = l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_139 = hidden_states_135 + down_proj_13 + hidden_states_135 = down_proj_13 = None + hidden_states_140 = hidden_states_139.to(torch.float32) + pow_29 = hidden_states_140.pow(2) + variance_28 = pow_29.mean(-1, keepdim=True) + pow_29 = None + add_84 = variance_28 + 1e-05 + variance_28 = None + rsqrt_28 = torch.rsqrt(add_84) + add_84 = None + hidden_states_141 = hidden_states_140 * rsqrt_28 + hidden_states_140 = rsqrt_28 = None + to_61 = hidden_states_141.to(torch.float32) + hidden_states_141 = None + hidden_states_142 = ( + l_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + * to_61 + ) + l_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + to_61 + ) = None + linear_98 = torch._C._nn.linear( + hidden_states_142, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_43 = linear_98.view((1, 3, -1, 64)) + linear_98 = None + query_states_14 = view_43.transpose(1, 2) + view_43 = None + linear_99 = torch._C._nn.linear( + hidden_states_142, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_44 = linear_99.view((1, 3, -1, 64)) + linear_99 = None + key_states_14 = view_44.transpose(1, 2) + view_44 = None + linear_100 = torch._C._nn.linear( + hidden_states_142, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_142 = l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_45 = linear_100.view((1, 3, -1, 64)) + linear_100 = None + value_states_14 = view_45.transpose(1, 2) + view_45 = None + cos_17 = cos_2.unsqueeze(1) + sin_17 = sin_2.unsqueeze(1) + mul_131 = query_states_14 * cos_17 + x1_28 = query_states_14[(Ellipsis, slice(None, 32, None))] + x2_28 = query_states_14[(Ellipsis, slice(32, None, None))] + query_states_14 = None + neg_28 = -x2_28 + x2_28 = None + cat_29 = torch.cat((neg_28, x1_28), dim=-1) + neg_28 = x1_28 = None + mul_132 = cat_29 * sin_17 + cat_29 = None + q_embed_14 = mul_131 + mul_132 + mul_131 = mul_132 = None + mul_133 = key_states_14 * cos_17 + cos_17 = None + x1_29 = key_states_14[(Ellipsis, slice(None, 32, None))] + x2_29 = key_states_14[(Ellipsis, slice(32, None, None))] + key_states_14 = None + neg_29 = -x2_29 + x2_29 = None + cat_30 = torch.cat((neg_29, x1_29), dim=-1) + neg_29 = x1_29 = None + mul_134 = cat_30 * sin_17 + cat_30 = sin_17 = None + k_embed_14 = mul_133 + mul_134 + mul_133 = mul_134 = None + getitem_107 = k_embed_14[ + ( + slice(None, None, None), + slice(None, None, None), + None, + 
slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_14 = None + hidden_states_143 = getitem_107.expand(1, 4, 8, 3, 64) + getitem_107 = None + key_28 = hidden_states_143.reshape(1, 32, 3, 64) + hidden_states_143 = None + getitem_108 = value_states_14[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_14 = None + hidden_states_144 = getitem_108.expand(1, 4, 8, 3, 64) + getitem_108 = None + value_28 = hidden_states_144.reshape(1, 32, 3, 64) + hidden_states_144 = None + attention_mask_15 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_14 = q_embed_14.contiguous() + q_embed_14 = None + key_29 = key_28.contiguous() + key_28 = None + value_29 = value_28.contiguous() + value_28 = None + attn_output_56 = torch._C._nn.scaled_dot_product_attention( + query_14, + key_29, + value_29, + attn_mask=attention_mask_15, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_14 = key_29 = value_29 = attention_mask_15 = None + transpose_60 = attn_output_56.transpose(1, 2) + attn_output_56 = None + attn_output_57 = transpose_60.contiguous() + transpose_60 = None + reshape_44 = attn_output_57.reshape(1, 3, -1) + attn_output_57 = None + attn_output_58 = reshape_44.contiguous() + reshape_44 = None + attn_output_59 = torch._C._nn.linear( + attn_output_58, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_58 = l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_145 = hidden_states_139 + attn_output_59 + hidden_states_139 = attn_output_59 = None + hidden_states_146 = hidden_states_145.to(torch.float32) + pow_30 = hidden_states_146.pow(2) + variance_29 = pow_30.mean(-1, keepdim=True) + pow_30 = None + add_88 = variance_29 + 1e-05 + variance_29 = None + rsqrt_29 = torch.rsqrt(add_88) + add_88 = None + hidden_states_147 = hidden_states_146 * rsqrt_29 + hidden_states_146 = rsqrt_29 = None + to_63 = hidden_states_147.to(torch.float32) + hidden_states_147 = None + hidden_states_148 = ( + l_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + * to_63 + ) + l_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = ( + to_63 + ) = None + linear_102 = torch._C._nn.linear( + hidden_states_148, + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_14 = torch.nn.functional.silu(linear_102, inplace=False) + linear_102 = None + linear_103 = torch._C._nn.linear( + hidden_states_148, + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_148 = l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_137 = silu_14 * linear_103 + silu_14 = linear_103 = None + down_proj_14 = torch._C._nn.linear( + mul_137, + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_137 = l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_149 = hidden_states_145 + down_proj_14 + 
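# The gate_proj / silu / up_proj / elementwise-multiply / down_proj chain
# above (linear_102, silu_14, linear_103, mul_137, down_proj_14) is a SwiGLU
# MLP, and together with the two residual adds and the two RMSNorms it
# completes one pre-norm decoder layer. A compact sketch reusing the helper
# sketches above (rms_norm, apply_rope, repeat_kv); every key in `w` is a
# hypothetical stand-in for the corresponding
# l_self_modules_..._parameters_weight_ graph input:
import torch
import torch.nn.functional as F

def swiglu_mlp(x, w_gate, w_up, w_down):
    return F.linear(F.silu(F.linear(x, w_gate)) * F.linear(x, w_up), w_down)

def decoder_layer(h, cos, sin, mask, w):
    res = h
    x = rms_norm(h, w["input_ln"])
    q = F.linear(x, w["q"]).view(1, 3, -1, 64).transpose(1, 2)
    k = F.linear(x, w["k"]).view(1, 3, -1, 64).transpose(1, 2)
    v = F.linear(x, w["v"]).view(1, 3, -1, 64).transpose(1, 2)
    q, k = apply_rope(q, k, cos, sin)
    k, v = repeat_kv(k, 8), repeat_kv(v, 8)
    a = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, scale=0.125)
    h = res + F.linear(a.transpose(1, 2).reshape(1, 3, -1), w["o"])  # first residual
    return h + swiglu_mlp(rms_norm(h, w["post_ln"]), w["gate"], w["up"], w["down"])  # second residual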
hidden_states_145 = down_proj_14 = None + hidden_states_150 = hidden_states_149.to(torch.float32) + pow_31 = hidden_states_150.pow(2) + variance_30 = pow_31.mean(-1, keepdim=True) + pow_31 = None + add_90 = variance_30 + 1e-05 + variance_30 = None + rsqrt_30 = torch.rsqrt(add_90) + add_90 = None + hidden_states_151 = hidden_states_150 * rsqrt_30 + hidden_states_150 = rsqrt_30 = None + to_65 = hidden_states_151.to(torch.float32) + hidden_states_151 = None + hidden_states_152 = ( + l_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + * to_65 + ) + l_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + to_65 + ) = None + linear_105 = torch._C._nn.linear( + hidden_states_152, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_46 = linear_105.view((1, 3, -1, 64)) + linear_105 = None + query_states_15 = view_46.transpose(1, 2) + view_46 = None + linear_106 = torch._C._nn.linear( + hidden_states_152, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_47 = linear_106.view((1, 3, -1, 64)) + linear_106 = None + key_states_15 = view_47.transpose(1, 2) + view_47 = None + linear_107 = torch._C._nn.linear( + hidden_states_152, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_152 = l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_48 = linear_107.view((1, 3, -1, 64)) + linear_107 = None + value_states_15 = view_48.transpose(1, 2) + view_48 = None + cos_18 = cos_2.unsqueeze(1) + sin_18 = sin_2.unsqueeze(1) + mul_140 = query_states_15 * cos_18 + x1_30 = query_states_15[(Ellipsis, slice(None, 32, None))] + x2_30 = query_states_15[(Ellipsis, slice(32, None, None))] + query_states_15 = None + neg_30 = -x2_30 + x2_30 = None + cat_31 = torch.cat((neg_30, x1_30), dim=-1) + neg_30 = x1_30 = None + mul_141 = cat_31 * sin_18 + cat_31 = None + q_embed_15 = mul_140 + mul_141 + mul_140 = mul_141 = None + mul_142 = key_states_15 * cos_18 + cos_18 = None + x1_31 = key_states_15[(Ellipsis, slice(None, 32, None))] + x2_31 = key_states_15[(Ellipsis, slice(32, None, None))] + key_states_15 = None + neg_31 = -x2_31 + x2_31 = None + cat_32 = torch.cat((neg_31, x1_31), dim=-1) + neg_31 = x1_31 = None + mul_143 = cat_32 * sin_18 + cat_32 = sin_18 = None + k_embed_15 = mul_142 + mul_143 + mul_142 = mul_143 = None + getitem_114 = k_embed_15[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_15 = None + hidden_states_153 = getitem_114.expand(1, 4, 8, 3, 64) + getitem_114 = None + key_30 = hidden_states_153.reshape(1, 32, 3, 64) + hidden_states_153 = None + getitem_115 = value_states_15[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_15 = None + hidden_states_154 = getitem_115.expand(1, 4, 8, 3, 64) + getitem_115 = None + value_30 = hidden_states_154.reshape(1, 32, 3, 64) + hidden_states_154 = None + attention_mask_16 = causal_mask_2[ + ( + slice(None, None, None), 
+ slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_15 = q_embed_15.contiguous() + q_embed_15 = None + key_31 = key_30.contiguous() + key_30 = None + value_31 = value_30.contiguous() + value_30 = None + attn_output_60 = torch._C._nn.scaled_dot_product_attention( + query_15, + key_31, + value_31, + attn_mask=attention_mask_16, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_15 = key_31 = value_31 = attention_mask_16 = None + transpose_64 = attn_output_60.transpose(1, 2) + attn_output_60 = None + attn_output_61 = transpose_64.contiguous() + transpose_64 = None + reshape_47 = attn_output_61.reshape(1, 3, -1) + attn_output_61 = None + attn_output_62 = reshape_47.contiguous() + reshape_47 = None + attn_output_63 = torch._C._nn.linear( + attn_output_62, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_62 = l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_155 = hidden_states_149 + attn_output_63 + hidden_states_149 = attn_output_63 = None + hidden_states_156 = hidden_states_155.to(torch.float32) + pow_32 = hidden_states_156.pow(2) + variance_31 = pow_32.mean(-1, keepdim=True) + pow_32 = None + add_94 = variance_31 + 1e-05 + variance_31 = None + rsqrt_31 = torch.rsqrt(add_94) + add_94 = None + hidden_states_157 = hidden_states_156 * rsqrt_31 + hidden_states_156 = rsqrt_31 = None + to_67 = hidden_states_157.to(torch.float32) + hidden_states_157 = None + hidden_states_158 = ( + l_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + * to_67 + ) + l_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = ( + to_67 + ) = None + linear_109 = torch._C._nn.linear( + hidden_states_158, + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_15 = torch.nn.functional.silu(linear_109, inplace=False) + linear_109 = None + linear_110 = torch._C._nn.linear( + hidden_states_158, + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_158 = l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_146 = silu_15 * linear_110 + silu_15 = linear_110 = None + down_proj_15 = torch._C._nn.linear( + mul_146, + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_146 = l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_159 = hidden_states_155 + down_proj_15 + hidden_states_155 = down_proj_15 = None + hidden_states_160 = hidden_states_159.to(torch.float32) + pow_33 = hidden_states_160.pow(2) + variance_32 = pow_33.mean(-1, keepdim=True) + pow_33 = None + add_96 = variance_32 + 1e-05 + variance_32 = None + rsqrt_32 = torch.rsqrt(add_96) + add_96 = None + hidden_states_161 = hidden_states_160 * rsqrt_32 + hidden_states_160 = rsqrt_32 = None + to_69 = hidden_states_161.to(torch.float32) + hidden_states_161 = None + hidden_states_162 = ( + l_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + * to_69 + ) + 
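# [Editor's note: explanatory aside, not part of the captured graph.] The
# to(float32) / pow(2) / mean(-1, keepdim=True) / rsqrt(variance + 1e-05)
# sequence that recurs before every attention and MLP block above is the
# Dynamo-traced form of RMSNorm. A minimal equivalent sketch with illustrative
# names (the second .to(torch.float32) in the trace is the cast back to the
# input dtype, a no-op in this float32 capture):
#
#     import torch
#
#     def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-05) -> torch.Tensor:
#         x = x.to(torch.float32)                     # normalize in fp32
#         variance = x.pow(2).mean(-1, keepdim=True)  # mean of squares over hidden dim
#         x = x * torch.rsqrt(variance + eps)         # x / RMS(x)
#         return weight * x                           # learned per-channel scale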
l_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + to_69 + ) = None + linear_112 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_49 = linear_112.view((1, 3, -1, 64)) + linear_112 = None + query_states_16 = view_49.transpose(1, 2) + view_49 = None + linear_113 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_50 = linear_113.view((1, 3, -1, 64)) + linear_113 = None + key_states_16 = view_50.transpose(1, 2) + view_50 = None + linear_114 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_162 = l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_51 = linear_114.view((1, 3, -1, 64)) + linear_114 = None + value_states_16 = view_51.transpose(1, 2) + view_51 = None + cos_19 = cos_2.unsqueeze(1) + sin_19 = sin_2.unsqueeze(1) + mul_149 = query_states_16 * cos_19 + x1_32 = query_states_16[(Ellipsis, slice(None, 32, None))] + x2_32 = query_states_16[(Ellipsis, slice(32, None, None))] + query_states_16 = None + neg_32 = -x2_32 + x2_32 = None + cat_33 = torch.cat((neg_32, x1_32), dim=-1) + neg_32 = x1_32 = None + mul_150 = cat_33 * sin_19 + cat_33 = None + q_embed_16 = mul_149 + mul_150 + mul_149 = mul_150 = None + mul_151 = key_states_16 * cos_19 + cos_19 = None + x1_33 = key_states_16[(Ellipsis, slice(None, 32, None))] + x2_33 = key_states_16[(Ellipsis, slice(32, None, None))] + key_states_16 = None + neg_33 = -x2_33 + x2_33 = None + cat_34 = torch.cat((neg_33, x1_33), dim=-1) + neg_33 = x1_33 = None + mul_152 = cat_34 * sin_19 + cat_34 = sin_19 = None + k_embed_16 = mul_151 + mul_152 + mul_151 = mul_152 = None + getitem_121 = k_embed_16[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_16 = None + hidden_states_163 = getitem_121.expand(1, 4, 8, 3, 64) + getitem_121 = None + key_32 = hidden_states_163.reshape(1, 32, 3, 64) + hidden_states_163 = None + getitem_122 = value_states_16[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_16 = None + hidden_states_164 = getitem_122.expand(1, 4, 8, 3, 64) + getitem_122 = None + value_32 = hidden_states_164.reshape(1, 32, 3, 64) + hidden_states_164 = None + attention_mask_17 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_16 = q_embed_16.contiguous() + q_embed_16 = None + key_33 = key_32.contiguous() + key_32 = None + value_33 = value_32.contiguous() + value_32 = None + attn_output_64 = torch._C._nn.scaled_dot_product_attention( + query_16, + key_33, + value_33, + attn_mask=attention_mask_17, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_16 = key_33 = value_33 = attention_mask_17 = None + transpose_68 = attn_output_64.transpose(1, 2) + attn_output_64 = None + attn_output_65 = 
transpose_68.contiguous() + transpose_68 = None + reshape_50 = attn_output_65.reshape(1, 3, -1) + attn_output_65 = None + attn_output_66 = reshape_50.contiguous() + reshape_50 = None + attn_output_67 = torch._C._nn.linear( + attn_output_66, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_66 = l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_165 = hidden_states_159 + attn_output_67 + hidden_states_159 = attn_output_67 = None + hidden_states_166 = hidden_states_165.to(torch.float32) + pow_34 = hidden_states_166.pow(2) + variance_33 = pow_34.mean(-1, keepdim=True) + pow_34 = None + add_100 = variance_33 + 1e-05 + variance_33 = None + rsqrt_33 = torch.rsqrt(add_100) + add_100 = None + hidden_states_167 = hidden_states_166 * rsqrt_33 + hidden_states_166 = rsqrt_33 = None + to_71 = hidden_states_167.to(torch.float32) + hidden_states_167 = None + hidden_states_168 = ( + l_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + * to_71 + ) + l_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = ( + to_71 + ) = None + linear_116 = torch._C._nn.linear( + hidden_states_168, + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_16 = torch.nn.functional.silu(linear_116, inplace=False) + linear_116 = None + linear_117 = torch._C._nn.linear( + hidden_states_168, + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_168 = l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_155 = silu_16 * linear_117 + silu_16 = linear_117 = None + down_proj_16 = torch._C._nn.linear( + mul_155, + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_155 = l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_169 = hidden_states_165 + down_proj_16 + hidden_states_165 = down_proj_16 = None + hidden_states_170 = hidden_states_169.to(torch.float32) + pow_35 = hidden_states_170.pow(2) + variance_34 = pow_35.mean(-1, keepdim=True) + pow_35 = None + add_102 = variance_34 + 1e-05 + variance_34 = None + rsqrt_34 = torch.rsqrt(add_102) + add_102 = None + hidden_states_171 = hidden_states_170 * rsqrt_34 + hidden_states_170 = rsqrt_34 = None + to_73 = hidden_states_171.to(torch.float32) + hidden_states_171 = None + hidden_states_172 = ( + l_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + * to_73 + ) + l_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + to_73 + ) = None + linear_119 = torch._C._nn.linear( + hidden_states_172, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_52 = linear_119.view((1, 3, -1, 64)) + linear_119 = None + query_states_17 = view_52.transpose(1, 2) + view_52 = None + linear_120 = torch._C._nn.linear( + hidden_states_172, + 
l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_53 = linear_120.view((1, 3, -1, 64)) + linear_120 = None + key_states_17 = view_53.transpose(1, 2) + view_53 = None + linear_121 = torch._C._nn.linear( + hidden_states_172, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_172 = l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_54 = linear_121.view((1, 3, -1, 64)) + linear_121 = None + value_states_17 = view_54.transpose(1, 2) + view_54 = None + cos_20 = cos_2.unsqueeze(1) + sin_20 = sin_2.unsqueeze(1) + mul_158 = query_states_17 * cos_20 + x1_34 = query_states_17[(Ellipsis, slice(None, 32, None))] + x2_34 = query_states_17[(Ellipsis, slice(32, None, None))] + query_states_17 = None + neg_34 = -x2_34 + x2_34 = None + cat_35 = torch.cat((neg_34, x1_34), dim=-1) + neg_34 = x1_34 = None + mul_159 = cat_35 * sin_20 + cat_35 = None + q_embed_17 = mul_158 + mul_159 + mul_158 = mul_159 = None + mul_160 = key_states_17 * cos_20 + cos_20 = None + x1_35 = key_states_17[(Ellipsis, slice(None, 32, None))] + x2_35 = key_states_17[(Ellipsis, slice(32, None, None))] + key_states_17 = None + neg_35 = -x2_35 + x2_35 = None + cat_36 = torch.cat((neg_35, x1_35), dim=-1) + neg_35 = x1_35 = None + mul_161 = cat_36 * sin_20 + cat_36 = sin_20 = None + k_embed_17 = mul_160 + mul_161 + mul_160 = mul_161 = None + getitem_128 = k_embed_17[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_17 = None + hidden_states_173 = getitem_128.expand(1, 4, 8, 3, 64) + getitem_128 = None + key_34 = hidden_states_173.reshape(1, 32, 3, 64) + hidden_states_173 = None + getitem_129 = value_states_17[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_17 = None + hidden_states_174 = getitem_129.expand(1, 4, 8, 3, 64) + getitem_129 = None + value_34 = hidden_states_174.reshape(1, 32, 3, 64) + hidden_states_174 = None + attention_mask_18 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_17 = q_embed_17.contiguous() + q_embed_17 = None + key_35 = key_34.contiguous() + key_34 = None + value_35 = value_34.contiguous() + value_34 = None + attn_output_68 = torch._C._nn.scaled_dot_product_attention( + query_17, + key_35, + value_35, + attn_mask=attention_mask_18, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_17 = key_35 = value_35 = attention_mask_18 = None + transpose_72 = attn_output_68.transpose(1, 2) + attn_output_68 = None + attn_output_69 = transpose_72.contiguous() + transpose_72 = None + reshape_53 = attn_output_69.reshape(1, 3, -1) + attn_output_69 = None + attn_output_70 = reshape_53.contiguous() + reshape_53 = None + attn_output_71 = torch._C._nn.linear( + attn_output_70, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_70 = l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_175 = hidden_states_169 + attn_output_71 + hidden_states_169 = attn_output_71 = None + 
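# [Editor's note: explanatory aside, not part of the captured graph.] Every
# attention block above repeats the same three steps: rotary position
# embedding (the slice/negate/concat pattern is "rotate_half"), grouped-query
# attention (the 4 KV heads from the [256, 2048] k_proj/v_proj are broadcast
# to the 32 query heads via expand(1, 4, 8, 3, 64) + reshape(1, 32, 3, 64)),
# and scaled_dot_product_attention with scale=0.125 = 1/sqrt(head_dim=64).
# A sketch of the same steps with illustrative names:
#
#     import torch
#     import torch.nn.functional as F
#
#     def rotate_half(x):
#         d = x.shape[-1] // 2
#         return torch.cat((-x[..., d:], x[..., :d]), dim=-1)
#
#     def apply_rope(q, k, cos, sin):
#         # cos/sin: [1, 1, seq, head_dim], broadcast over the head axis
#         return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin
#
#     def repeat_kv(kv, n_rep=8):
#         # [1, 4, seq, 64] -> [1, 4, n_rep, seq, 64] -> [1, 4 * n_rep, seq, 64]
#         b, h, s, d = kv.shape
#         return kv[:, :, None].expand(b, h, n_rep, s, d).reshape(b, h * n_rep, s, d)
#
#     # attn = F.scaled_dot_product_attention(q, repeat_kv(k), repeat_kv(v),
#     #                                       attn_mask=causal_mask, scale=0.125)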
hidden_states_176 = hidden_states_175.to(torch.float32) + pow_36 = hidden_states_176.pow(2) + variance_35 = pow_36.mean(-1, keepdim=True) + pow_36 = None + add_106 = variance_35 + 1e-05 + variance_35 = None + rsqrt_35 = torch.rsqrt(add_106) + add_106 = None + hidden_states_177 = hidden_states_176 * rsqrt_35 + hidden_states_176 = rsqrt_35 = None + to_75 = hidden_states_177.to(torch.float32) + hidden_states_177 = None + hidden_states_178 = ( + l_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + * to_75 + ) + l_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = ( + to_75 + ) = None + linear_123 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_17 = torch.nn.functional.silu(linear_123, inplace=False) + linear_123 = None + linear_124 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_178 = l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_164 = silu_17 * linear_124 + silu_17 = linear_124 = None + down_proj_17 = torch._C._nn.linear( + mul_164, + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_164 = l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_179 = hidden_states_175 + down_proj_17 + hidden_states_175 = down_proj_17 = None + hidden_states_180 = hidden_states_179.to(torch.float32) + pow_37 = hidden_states_180.pow(2) + variance_36 = pow_37.mean(-1, keepdim=True) + pow_37 = None + add_108 = variance_36 + 1e-05 + variance_36 = None + rsqrt_36 = torch.rsqrt(add_108) + add_108 = None + hidden_states_181 = hidden_states_180 * rsqrt_36 + hidden_states_180 = rsqrt_36 = None + to_77 = hidden_states_181.to(torch.float32) + hidden_states_181 = None + hidden_states_182 = ( + l_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + * to_77 + ) + l_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + to_77 + ) = None + linear_126 = torch._C._nn.linear( + hidden_states_182, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_55 = linear_126.view((1, 3, -1, 64)) + linear_126 = None + query_states_18 = view_55.transpose(1, 2) + view_55 = None + linear_127 = torch._C._nn.linear( + hidden_states_182, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_56 = linear_127.view((1, 3, -1, 64)) + linear_127 = None + key_states_18 = view_56.transpose(1, 2) + view_56 = None + linear_128 = torch._C._nn.linear( + hidden_states_182, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_182 = 
l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_57 = linear_128.view((1, 3, -1, 64)) + linear_128 = None + value_states_18 = view_57.transpose(1, 2) + view_57 = None + cos_21 = cos_2.unsqueeze(1) + sin_21 = sin_2.unsqueeze(1) + mul_167 = query_states_18 * cos_21 + x1_36 = query_states_18[(Ellipsis, slice(None, 32, None))] + x2_36 = query_states_18[(Ellipsis, slice(32, None, None))] + query_states_18 = None + neg_36 = -x2_36 + x2_36 = None + cat_37 = torch.cat((neg_36, x1_36), dim=-1) + neg_36 = x1_36 = None + mul_168 = cat_37 * sin_21 + cat_37 = None + q_embed_18 = mul_167 + mul_168 + mul_167 = mul_168 = None + mul_169 = key_states_18 * cos_21 + cos_21 = None + x1_37 = key_states_18[(Ellipsis, slice(None, 32, None))] + x2_37 = key_states_18[(Ellipsis, slice(32, None, None))] + key_states_18 = None + neg_37 = -x2_37 + x2_37 = None + cat_38 = torch.cat((neg_37, x1_37), dim=-1) + neg_37 = x1_37 = None + mul_170 = cat_38 * sin_21 + cat_38 = sin_21 = None + k_embed_18 = mul_169 + mul_170 + mul_169 = mul_170 = None + getitem_135 = k_embed_18[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_18 = None + hidden_states_183 = getitem_135.expand(1, 4, 8, 3, 64) + getitem_135 = None + key_36 = hidden_states_183.reshape(1, 32, 3, 64) + hidden_states_183 = None + getitem_136 = value_states_18[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_18 = None + hidden_states_184 = getitem_136.expand(1, 4, 8, 3, 64) + getitem_136 = None + value_36 = hidden_states_184.reshape(1, 32, 3, 64) + hidden_states_184 = None + attention_mask_19 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_18 = q_embed_18.contiguous() + q_embed_18 = None + key_37 = key_36.contiguous() + key_36 = None + value_37 = value_36.contiguous() + value_36 = None + attn_output_72 = torch._C._nn.scaled_dot_product_attention( + query_18, + key_37, + value_37, + attn_mask=attention_mask_19, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_18 = key_37 = value_37 = attention_mask_19 = None + transpose_76 = attn_output_72.transpose(1, 2) + attn_output_72 = None + attn_output_73 = transpose_76.contiguous() + transpose_76 = None + reshape_56 = attn_output_73.reshape(1, 3, -1) + attn_output_73 = None + attn_output_74 = reshape_56.contiguous() + reshape_56 = None + attn_output_75 = torch._C._nn.linear( + attn_output_74, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_74 = l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_185 = hidden_states_179 + attn_output_75 + hidden_states_179 = attn_output_75 = None + hidden_states_186 = hidden_states_185.to(torch.float32) + pow_38 = hidden_states_186.pow(2) + variance_37 = pow_38.mean(-1, keepdim=True) + pow_38 = None + add_112 = variance_37 + 1e-05 + variance_37 = None + rsqrt_37 = torch.rsqrt(add_112) + add_112 = None + hidden_states_187 = hidden_states_186 * rsqrt_37 + hidden_states_186 = rsqrt_37 = None + to_79 = hidden_states_187.to(torch.float32) + hidden_states_187 = None + hidden_states_188 = ( + l_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + * to_79 + ) 
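# [Editor's note: explanatory aside, not part of the captured graph.] The
# gate_proj / silu / up_proj / down_proj sequence repeated after every
# post-attention norm is the SwiGLU-style LLaMA MLP (2048 -> 5632 -> 2048 per
# weight_meta.py, all projections bias-free). A one-line sketch with
# illustrative names:
#
#     import torch.nn.functional as F
#
#     def llama_mlp(x, w_gate, w_up, w_down):
#         # down( silu(gate(x)) * up(x) )
#         return F.linear(F.silu(F.linear(x, w_gate)) * F.linear(x, w_up), w_down)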
+ l_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = ( + to_79 + ) = None + linear_130 = torch._C._nn.linear( + hidden_states_188, + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_18 = torch.nn.functional.silu(linear_130, inplace=False) + linear_130 = None + linear_131 = torch._C._nn.linear( + hidden_states_188, + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_188 = l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_173 = silu_18 * linear_131 + silu_18 = linear_131 = None + down_proj_18 = torch._C._nn.linear( + mul_173, + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_173 = l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_189 = hidden_states_185 + down_proj_18 + hidden_states_185 = down_proj_18 = None + hidden_states_190 = hidden_states_189.to(torch.float32) + pow_39 = hidden_states_190.pow(2) + variance_38 = pow_39.mean(-1, keepdim=True) + pow_39 = None + add_114 = variance_38 + 1e-05 + variance_38 = None + rsqrt_38 = torch.rsqrt(add_114) + add_114 = None + hidden_states_191 = hidden_states_190 * rsqrt_38 + hidden_states_190 = rsqrt_38 = None + to_81 = hidden_states_191.to(torch.float32) + hidden_states_191 = None + hidden_states_192 = ( + l_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + * to_81 + ) + l_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + to_81 + ) = None + linear_133 = torch._C._nn.linear( + hidden_states_192, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_58 = linear_133.view((1, 3, -1, 64)) + linear_133 = None + query_states_19 = view_58.transpose(1, 2) + view_58 = None + linear_134 = torch._C._nn.linear( + hidden_states_192, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_59 = linear_134.view((1, 3, -1, 64)) + linear_134 = None + key_states_19 = view_59.transpose(1, 2) + view_59 = None + linear_135 = torch._C._nn.linear( + hidden_states_192, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_192 = l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_60 = linear_135.view((1, 3, -1, 64)) + linear_135 = None + value_states_19 = view_60.transpose(1, 2) + view_60 = None + cos_22 = cos_2.unsqueeze(1) + sin_22 = sin_2.unsqueeze(1) + mul_176 = query_states_19 * cos_22 + x1_38 = query_states_19[(Ellipsis, slice(None, 32, None))] + x2_38 = query_states_19[(Ellipsis, slice(32, None, None))] + query_states_19 = None + neg_38 = -x2_38 + x2_38 = None + cat_39 = torch.cat((neg_38, x1_38), dim=-1) + neg_38 = x1_38 = None + mul_177 = cat_39 * sin_22 + cat_39 = None + q_embed_19 = mul_176 + 
mul_177 + mul_176 = mul_177 = None + mul_178 = key_states_19 * cos_22 + cos_22 = None + x1_39 = key_states_19[(Ellipsis, slice(None, 32, None))] + x2_39 = key_states_19[(Ellipsis, slice(32, None, None))] + key_states_19 = None + neg_39 = -x2_39 + x2_39 = None + cat_40 = torch.cat((neg_39, x1_39), dim=-1) + neg_39 = x1_39 = None + mul_179 = cat_40 * sin_22 + cat_40 = sin_22 = None + k_embed_19 = mul_178 + mul_179 + mul_178 = mul_179 = None + getitem_142 = k_embed_19[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_19 = None + hidden_states_193 = getitem_142.expand(1, 4, 8, 3, 64) + getitem_142 = None + key_38 = hidden_states_193.reshape(1, 32, 3, 64) + hidden_states_193 = None + getitem_143 = value_states_19[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_19 = None + hidden_states_194 = getitem_143.expand(1, 4, 8, 3, 64) + getitem_143 = None + value_38 = hidden_states_194.reshape(1, 32, 3, 64) + hidden_states_194 = None + attention_mask_20 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_19 = q_embed_19.contiguous() + q_embed_19 = None + key_39 = key_38.contiguous() + key_38 = None + value_39 = value_38.contiguous() + value_38 = None + attn_output_76 = torch._C._nn.scaled_dot_product_attention( + query_19, + key_39, + value_39, + attn_mask=attention_mask_20, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_19 = key_39 = value_39 = attention_mask_20 = None + transpose_80 = attn_output_76.transpose(1, 2) + attn_output_76 = None + attn_output_77 = transpose_80.contiguous() + transpose_80 = None + reshape_59 = attn_output_77.reshape(1, 3, -1) + attn_output_77 = None + attn_output_78 = reshape_59.contiguous() + reshape_59 = None + attn_output_79 = torch._C._nn.linear( + attn_output_78, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_78 = l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_195 = hidden_states_189 + attn_output_79 + hidden_states_189 = attn_output_79 = None + hidden_states_196 = hidden_states_195.to(torch.float32) + pow_40 = hidden_states_196.pow(2) + variance_39 = pow_40.mean(-1, keepdim=True) + pow_40 = None + add_118 = variance_39 + 1e-05 + variance_39 = None + rsqrt_39 = torch.rsqrt(add_118) + add_118 = None + hidden_states_197 = hidden_states_196 * rsqrt_39 + hidden_states_196 = rsqrt_39 = None + to_83 = hidden_states_197.to(torch.float32) + hidden_states_197 = None + hidden_states_198 = ( + l_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + * to_83 + ) + l_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = ( + to_83 + ) = None + linear_137 = torch._C._nn.linear( + hidden_states_198, + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_19 = torch.nn.functional.silu(linear_137, inplace=False) + linear_137 = None + linear_138 = torch._C._nn.linear( + hidden_states_198, + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_, + 
None, + ) + hidden_states_198 = l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_182 = silu_19 * linear_138 + silu_19 = linear_138 = None + down_proj_19 = torch._C._nn.linear( + mul_182, + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_182 = l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_199 = hidden_states_195 + down_proj_19 + hidden_states_195 = down_proj_19 = None + hidden_states_200 = hidden_states_199.to(torch.float32) + pow_41 = hidden_states_200.pow(2) + variance_40 = pow_41.mean(-1, keepdim=True) + pow_41 = None + add_120 = variance_40 + 1e-05 + variance_40 = None + rsqrt_40 = torch.rsqrt(add_120) + add_120 = None + hidden_states_201 = hidden_states_200 * rsqrt_40 + hidden_states_200 = rsqrt_40 = None + to_85 = hidden_states_201.to(torch.float32) + hidden_states_201 = None + hidden_states_202 = ( + l_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + * to_85 + ) + l_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + to_85 + ) = None + linear_140 = torch._C._nn.linear( + hidden_states_202, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_61 = linear_140.view((1, 3, -1, 64)) + linear_140 = None + query_states_20 = view_61.transpose(1, 2) + view_61 = None + linear_141 = torch._C._nn.linear( + hidden_states_202, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_62 = linear_141.view((1, 3, -1, 64)) + linear_141 = None + key_states_20 = view_62.transpose(1, 2) + view_62 = None + linear_142 = torch._C._nn.linear( + hidden_states_202, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_202 = l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_63 = linear_142.view((1, 3, -1, 64)) + linear_142 = None + value_states_20 = view_63.transpose(1, 2) + view_63 = None + cos_23 = cos_2.unsqueeze(1) + sin_23 = sin_2.unsqueeze(1) + mul_185 = query_states_20 * cos_23 + x1_40 = query_states_20[(Ellipsis, slice(None, 32, None))] + x2_40 = query_states_20[(Ellipsis, slice(32, None, None))] + query_states_20 = None + neg_40 = -x2_40 + x2_40 = None + cat_41 = torch.cat((neg_40, x1_40), dim=-1) + neg_40 = x1_40 = None + mul_186 = cat_41 * sin_23 + cat_41 = None + q_embed_20 = mul_185 + mul_186 + mul_185 = mul_186 = None + mul_187 = key_states_20 * cos_23 + cos_23 = None + x1_41 = key_states_20[(Ellipsis, slice(None, 32, None))] + x2_41 = key_states_20[(Ellipsis, slice(32, None, None))] + key_states_20 = None + neg_41 = -x2_41 + x2_41 = None + cat_42 = torch.cat((neg_41, x1_41), dim=-1) + neg_41 = x1_41 = None + mul_188 = cat_42 * sin_23 + cat_42 = sin_23 = None + k_embed_20 = mul_187 + mul_188 + mul_187 = mul_188 = None + getitem_149 = k_embed_20[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_20 = None + hidden_states_203 = 
getitem_149.expand(1, 4, 8, 3, 64) + getitem_149 = None + key_40 = hidden_states_203.reshape(1, 32, 3, 64) + hidden_states_203 = None + getitem_150 = value_states_20[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_20 = None + hidden_states_204 = getitem_150.expand(1, 4, 8, 3, 64) + getitem_150 = None + value_40 = hidden_states_204.reshape(1, 32, 3, 64) + hidden_states_204 = None + attention_mask_21 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, None), + ) + ] + query_20 = q_embed_20.contiguous() + q_embed_20 = None + key_41 = key_40.contiguous() + key_40 = None + value_41 = value_40.contiguous() + value_40 = None + attn_output_80 = torch._C._nn.scaled_dot_product_attention( + query_20, + key_41, + value_41, + attn_mask=attention_mask_21, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_20 = key_41 = value_41 = attention_mask_21 = None + transpose_84 = attn_output_80.transpose(1, 2) + attn_output_80 = None + attn_output_81 = transpose_84.contiguous() + transpose_84 = None + reshape_62 = attn_output_81.reshape(1, 3, -1) + attn_output_81 = None + attn_output_82 = reshape_62.contiguous() + reshape_62 = None + attn_output_83 = torch._C._nn.linear( + attn_output_82, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_82 = l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_205 = hidden_states_199 + attn_output_83 + hidden_states_199 = attn_output_83 = None + hidden_states_206 = hidden_states_205.to(torch.float32) + pow_42 = hidden_states_206.pow(2) + variance_41 = pow_42.mean(-1, keepdim=True) + pow_42 = None + add_124 = variance_41 + 1e-05 + variance_41 = None + rsqrt_41 = torch.rsqrt(add_124) + add_124 = None + hidden_states_207 = hidden_states_206 * rsqrt_41 + hidden_states_206 = rsqrt_41 = None + to_87 = hidden_states_207.to(torch.float32) + hidden_states_207 = None + hidden_states_208 = ( + l_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + * to_87 + ) + l_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = ( + to_87 + ) = None + linear_144 = torch._C._nn.linear( + hidden_states_208, + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_20 = torch.nn.functional.silu(linear_144, inplace=False) + linear_144 = None + linear_145 = torch._C._nn.linear( + hidden_states_208, + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_208 = l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_191 = silu_20 * linear_145 + silu_20 = linear_145 = None + down_proj_20 = torch._C._nn.linear( + mul_191, + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_191 = l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_209 = hidden_states_205 + down_proj_20 + hidden_states_205 = down_proj_20 = None + hidden_states_210 = hidden_states_209.to(torch.float32) + pow_43 = 
hidden_states_210.pow(2) + variance_42 = pow_43.mean(-1, keepdim=True) + pow_43 = None + add_126 = variance_42 + 1e-05 + variance_42 = None + rsqrt_42 = torch.rsqrt(add_126) + add_126 = None + hidden_states_211 = hidden_states_210 * rsqrt_42 + hidden_states_210 = rsqrt_42 = None + to_89 = hidden_states_211.to(torch.float32) + hidden_states_211 = None + hidden_states_212 = ( + l_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + * to_89 + ) + l_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + to_89 + ) = None + linear_147 = torch._C._nn.linear( + hidden_states_212, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = ( + None + ) + view_64 = linear_147.view((1, 3, -1, 64)) + linear_147 = None + query_states_21 = view_64.transpose(1, 2) + view_64 = None + linear_148 = torch._C._nn.linear( + hidden_states_212, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = ( + None + ) + view_65 = linear_148.view((1, 3, -1, 64)) + linear_148 = None + key_states_21 = view_65.transpose(1, 2) + view_65 = None + linear_149 = torch._C._nn.linear( + hidden_states_212, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_, + None, + ) + hidden_states_212 = l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = (None) + view_66 = linear_149.view((1, 3, -1, 64)) + linear_149 = None + value_states_21 = view_66.transpose(1, 2) + view_66 = None + cos_24 = cos_2.unsqueeze(1) + cos_2 = None + sin_24 = sin_2.unsqueeze(1) + sin_2 = None + mul_194 = query_states_21 * cos_24 + x1_42 = query_states_21[(Ellipsis, slice(None, 32, None))] + x2_42 = query_states_21[(Ellipsis, slice(32, None, None))] + query_states_21 = None + neg_42 = -x2_42 + x2_42 = None + cat_43 = torch.cat((neg_42, x1_42), dim=-1) + neg_42 = x1_42 = None + mul_195 = cat_43 * sin_24 + cat_43 = None + q_embed_21 = mul_194 + mul_195 + mul_194 = mul_195 = None + mul_196 = key_states_21 * cos_24 + cos_24 = None + x1_43 = key_states_21[(Ellipsis, slice(None, 32, None))] + x2_43 = key_states_21[(Ellipsis, slice(32, None, None))] + key_states_21 = None + neg_43 = -x2_43 + x2_43 = None + cat_44 = torch.cat((neg_43, x1_43), dim=-1) + neg_43 = x1_43 = None + mul_197 = cat_44 * sin_24 + cat_44 = sin_24 = None + k_embed_21 = mul_196 + mul_197 + mul_196 = mul_197 = None + getitem_156 = k_embed_21[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + k_embed_21 = None + hidden_states_213 = getitem_156.expand(1, 4, 8, 3, 64) + getitem_156 = None + key_42 = hidden_states_213.reshape(1, 32, 3, 64) + hidden_states_213 = None + getitem_157 = value_states_21[ + ( + slice(None, None, None), + slice(None, None, None), + None, + slice(None, None, None), + slice(None, None, None), + ) + ] + value_states_21 = None + hidden_states_214 = getitem_157.expand(1, 4, 8, 3, 64) + getitem_157 = None + value_42 = hidden_states_214.reshape(1, 32, 3, 64) + hidden_states_214 = None + attention_mask_22 = causal_mask_2[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 3, 
None), + ) + ] + causal_mask_2 = None + query_21 = q_embed_21.contiguous() + q_embed_21 = None + key_43 = key_42.contiguous() + key_42 = None + value_43 = value_42.contiguous() + value_42 = None + attn_output_84 = torch._C._nn.scaled_dot_product_attention( + query_21, + key_43, + value_43, + attn_mask=attention_mask_22, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + query_21 = key_43 = value_43 = attention_mask_22 = None + transpose_88 = attn_output_84.transpose(1, 2) + attn_output_84 = None + attn_output_85 = transpose_88.contiguous() + transpose_88 = None + reshape_65 = attn_output_85.reshape(1, 3, -1) + attn_output_85 = None + attn_output_86 = reshape_65.contiguous() + reshape_65 = None + attn_output_87 = torch._C._nn.linear( + attn_output_86, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_86 = l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_215 = hidden_states_209 + attn_output_87 + hidden_states_209 = attn_output_87 = None + hidden_states_216 = hidden_states_215.to(torch.float32) + pow_44 = hidden_states_216.pow(2) + variance_43 = pow_44.mean(-1, keepdim=True) + pow_44 = None + add_130 = variance_43 + 1e-05 + variance_43 = None + rsqrt_43 = torch.rsqrt(add_130) + add_130 = None + hidden_states_217 = hidden_states_216 * rsqrt_43 + hidden_states_216 = rsqrt_43 = None + to_91 = hidden_states_217.to(torch.float32) + hidden_states_217 = None + hidden_states_218 = ( + l_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + * to_91 + ) + l_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = ( + to_91 + ) = None + linear_151 = torch._C._nn.linear( + hidden_states_218, + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_21 = torch.nn.functional.silu(linear_151, inplace=False) + linear_151 = None + linear_152 = torch._C._nn.linear( + hidden_states_218, + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_218 = l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_200 = silu_21 * linear_152 + silu_21 = linear_152 = None + down_proj_21 = torch._C._nn.linear( + mul_200, + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_200 = l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_219 = hidden_states_215 + down_proj_21 + hidden_states_215 = down_proj_21 = None + hidden_states_220 = hidden_states_219.to(torch.float32) + hidden_states_219 = None + pow_45 = hidden_states_220.pow(2) + variance_44 = pow_45.mean(-1, keepdim=True) + pow_45 = None + add_132 = variance_44 + 1e-05 + variance_44 = None + rsqrt_44 = torch.rsqrt(add_132) + add_132 = None + hidden_states_221 = hidden_states_220 * rsqrt_44 + hidden_states_220 = rsqrt_44 = None + to_93 = hidden_states_221.to(torch.float32) + hidden_states_221 = None + hidden_states_222 = l_self_modules_model_modules_norm_parameters_weight_ * to_93 + l_self_modules_model_modules_norm_parameters_weight_ = to_93 = None + return (hidden_states_222,) diff --git 
a/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/weight_meta.py b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/weight_meta.py
new file mode 100644
index 000000000..62300c932
--- /dev/null
+++ b/samples/transformers-auto-model/TinyLlama/TinyLlama-1.1B-Chat-v0.4/weight_meta.py
@@ -0,0 +1,2061 @@
+class Program_weight_tensor_meta_L_kwargs_input_ids_:
+    name = "L_kwargs_input_ids_"
+    shape = [1, 3]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 15043, 3186]
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_embed_tokens_parameters_weight_:
+    name = "L_self_modules_model_modules_embed_tokens_parameters_weight_"
+    shape = [32003, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_kwargs_attention_mask_:
+    name = "L_kwargs_attention_mask_"
+    shape = [1, 3]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 1, 1]
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_rotary_emb_buffers_inv_freq_:
+    name = "L_self_modules_model_modules_rotary_emb_buffers_inv_freq_"
+    shape = [32]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.125
+    std = 0.240
+    data = [
+        1.000000,
+        0.749894,
+        0.562341,
+        0.421697,
+        0.316228,
+        0.237137,
+        0.177828,
+        0.133352,
+        0.100000,
+        0.074989,
+        0.056234,
+        0.042170,
+        0.031623,
+        0.023714,
+        0.017783,
+        0.013335,
+        0.010000,
+        0.007499,
+        0.005623,
+        0.004217,
+        0.003162,
+        0.002371,
+        0.001778,
+        0.001334,
+        0.001000,
+        0.000750,
+        0.000562,
+        0.000422,
+        0.000316,
+        0.000237,
+        0.000178,
+        0.000133,
+    ]
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_"
+    shape = [256, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_"
+    shape = [256, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_"
+    shape = [2048, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [256, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [2048, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_" + shape = [5632, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_" + shape = [2048, 5632] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_norm_parameters_weight_: + name = "L_self_modules_model_modules_norm_parameters_weight_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/bigscience/bloom-560m/graph_hash.txt b/samples/transformers-auto-model/bigscience/bloom-560m/graph_hash.txt new file mode 100644 index 000000000..f4fbc8aa7 --- /dev/null +++ b/samples/transformers-auto-model/bigscience/bloom-560m/graph_hash.txt @@ -0,0 +1 @@ +707b9b5bcb29624e52322253ddd34db23cd8a07b69a396e398a01cd20c3e3203 \ No newline at end of file diff --git a/samples/transformers-auto-model/bigscience/bloom-560m/graph_net.json b/samples/transformers-auto-model/bigscience/bloom-560m/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/bigscience/bloom-560m/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/bigscience/bloom-560m/input_meta.py b/samples/transformers-auto-model/bigscience/bloom-560m/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/bigscience/bloom-560m/input_tensor_constraints.py b/samples/transformers-auto-model/bigscience/bloom-560m/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/bigscience/bloom-560m/model.py b/samples/transformers-auto-model/bigscience/bloom-560m/model.py new file mode 100644 index 000000000..a411957fe --- /dev/null +++ b/samples/transformers-auto-model/bigscience/bloom-560m/model.py @@ -0,0 +1,3799 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_inputs_embeds_: torch.Tensor, + L_self_modules_word_embeddings_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_word_embeddings_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_attention_mask_: torch.Tensor, + L_self_modules_h_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, 
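+ # The argument names here appear to be torch.compile (Dynamo) flattenings of
+ # module attribute paths -- an inference from the naming pattern, not stated
+ # in this patch: e.g. L['self'].h[0].self_attention.query_key_value.weight is
+ # captured as the parameter just above. BLOOM fuses the Q/K/V projections
+ # into a single query_key_value linear layer, so each decoder block supplies
+ # input_layernorm, query_key_value, dense (attention output),
+ # post_attention_layernorm, and the dense_h_to_4h / dense_4h_to_h MLP pair,
+ # each as a weight/bias couple. The matching weight_meta.py files record only
+ # shape, dtype, device, mean, and std (with data = None), presumably so that
+ # placeholder tensors can be synthesized without shipping real checkpoints.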
+ L_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_input_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_ln_f_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_ln_f_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_inputs_embeds_ = L_inputs_embeds_ + l_self_modules_word_embeddings_layernorm_parameters_weight_ = ( + L_self_modules_word_embeddings_layernorm_parameters_weight_ + ) + l_self_modules_word_embeddings_layernorm_parameters_bias_ = ( + L_self_modules_word_embeddings_layernorm_parameters_bias_ + ) + l_attention_mask_ = L_attention_mask_ + l_self_modules_h_modules_0_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_0_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_0_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_0_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_1_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_1_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_1_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_1_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_weight_ = 
L_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_2_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_2_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_2_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_2_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = 
L_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_3_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_3_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_3_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_3_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_4_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_4_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_4_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_4_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = 
L_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_5_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_5_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_5_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_5_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_6_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_6_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_6_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_6_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_bias_ = 
L_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_7_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_7_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_7_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_7_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_8_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_8_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_8_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_8_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_weight_ + 
l_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_9_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_9_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_9_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_9_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + 
l_self_modules_h_modules_10_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_10_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_10_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_10_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_11_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_11_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_11_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_11_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + 
l_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_12_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_12_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_12_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_12_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_13_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_13_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_13_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_13_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_bias_ + 
l_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_14_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_14_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_14_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_14_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_15_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_15_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_15_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_15_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_weight_ + 
l_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_16_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_16_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_16_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_16_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + 
l_self_modules_h_modules_17_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_17_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_17_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_17_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_18_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_18_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_18_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_18_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + 
l_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_19_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_19_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_19_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_19_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_20_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_20_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_20_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_20_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_bias_ + 
l_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_21_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_21_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_21_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_21_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_22_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_22_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_22_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_22_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_weight_ + 
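# NOTE: This aliasing pattern repeats for all 24 blocks (h.0 ... h.23) and ends
# with the final `ln_f` norm parameters below. Shapes are hard-coded throughout
# for the traced call: batch=1, seq_len=2, hidden=1024, heads=16, head_dim=64;
# the layout (word_embeddings_layernorm, fused query_key_value projections,
# ALiBi position biases) is consistent with a BLOOM-style decoder. The
# immediate `name = None` re-bindings are FX codegen's early frees, releasing
# each tensor as soon as its last use has executed to match eager-mode peak
# memory.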
l_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_h_modules_23_modules_input_layernorm_parameters_weight_ = ( + L_self_modules_h_modules_23_modules_input_layernorm_parameters_weight_ + ) + l_self_modules_h_modules_23_modules_input_layernorm_parameters_bias_ = ( + L_self_modules_h_modules_23_modules_input_layernorm_parameters_bias_ + ) + l_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_weight_ = L_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_weight_ + l_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_bias_ = L_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_bias_ + l_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_weight_ = L_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_weight_ + l_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_bias_ = L_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_bias_ + l_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_bias_ = L_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_bias_ + l_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = L_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_weight_ + l_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = L_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_bias_ + l_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = L_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_weight_ + l_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = L_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_bias_ + l_self_modules_ln_f_parameters_weight_ = 
L_self_modules_ln_f_parameters_weight_
l_self_modules_ln_f_parameters_bias_ = L_self_modules_ln_f_parameters_bias_
cache_position = torch.arange(0, 2, device=device(type="cuda", index=0))
hidden_states = torch.nn.functional.layer_norm(l_inputs_embeds_, (1024,), l_self_modules_word_embeddings_layernorm_parameters_weight_, l_self_modules_word_embeddings_layernorm_parameters_bias_, 1e-05)
l_inputs_embeds_ = l_self_modules_word_embeddings_layernorm_parameters_weight_ = l_self_modules_word_embeddings_layernorm_parameters_bias_ = None
attention_mask = l_attention_mask_.to(device(type="cuda", index=0))
l_attention_mask_ = None
base = torch.tensor(0.7071067811865476, device=device(type="cuda", index=0), dtype=torch.float32)
powers = torch.arange(1, 17, device=device(type="cuda", index=0), dtype=torch.int32)
slopes = torch.pow(base, powers)
base = powers = None
cumsum = attention_mask.cumsum(dim=-1)
sub = cumsum - 1
cumsum = None
mul = sub * attention_mask
sub = None
arange_tensor = mul[(slice(None, None, None), None, slice(None, None, None))]
mul = None
getitem_1 = slopes[(Ellipsis, None)]
slopes = None
alibi = getitem_1 * arange_tensor
getitem_1 = arange_tensor = None
reshape = alibi.reshape(16, 1, 2)
alibi = None
alibi_1 = reshape.to(torch.float32)
reshape = None
causal_mask = torch.full((2, 2), fill_value=-3.4028234663852886e38, dtype=torch.float32, device=device(type="cuda", index=0))
causal_mask_1 = torch.triu(causal_mask, diagonal=1)
causal_mask = None
arange_2 = torch.arange(2, device=device(type="cuda", index=0))
reshape_1 = cache_position.reshape(-1, 1)
cache_position = None
gt = arange_2 > reshape_1
arange_2 = reshape_1 = None
causal_mask_1 *= gt
causal_mask_2 = causal_mask_1
causal_mask_1 = gt = None
getitem_2 = causal_mask_2[(None, None, slice(None, None, None), slice(None, None, None))]
causal_mask_2 = None
causal_mask_3 = getitem_2.expand(1, 1, -1, -1)
getitem_2 = None
causal_mask_4 = causal_mask_3.clone()
causal_mask_3 = None
getitem_3 = causal_mask_4[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
getitem_4 = attention_mask[(slice(None, None, None), None, None, slice(None, None, None))]
attention_mask = None
to_2 = getitem_4.to(device(type="cuda", index=0))
getitem_4 = None
padding_mask = getitem_3 + to_2
getitem_3 = to_2 = None
padding_mask_1 = padding_mask.__eq__(0)
padding_mask = None
getitem_5 = causal_mask_4[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))]
masked_fill = getitem_5.masked_fill(padding_mask_1, -3.4028234663852886e38)
getitem_5 = padding_mask_1 = None
causal_mask_4[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 2, None))] = masked_fill
setitem = causal_mask_4
masked_fill = setitem = None
layernorm_output = torch.nn.functional.layer_norm(hidden_states, (1024,), l_self_modules_h_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_h_modules_0_modules_input_layernorm_parameters_bias_, 1e-05)
l_self_modules_h_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_h_modules_0_modules_input_layernorm_parameters_bias_ = None
fused_qkv = torch._C._nn.linear(
    layernorm_output,
l_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output = l_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_1 = fused_qkv.view(1, 2, 16, 3, 64) + fused_qkv = None + getitem_6 = fused_qkv_1[(Ellipsis, 0, slice(None, None, None))] + query_layer = getitem_6.transpose(1, 2) + getitem_6 = None + getitem_7 = fused_qkv_1[(Ellipsis, 1, slice(None, None, None))] + key_layer = getitem_7.transpose(1, 2) + getitem_7 = None + getitem_8 = fused_qkv_1[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_1 = None + value_layer = getitem_8.transpose(1, 2) + getitem_8 = None + query_layer_1 = query_layer.reshape(16, -1, 64) + query_layer = None + reshape_3 = key_layer.reshape(16, -1, 64) + key_layer_1 = reshape_3.transpose(-1, -2) + reshape_3 = None + value_layer_1 = value_layer.reshape(16, -1, 64) + attention_scores = alibi_1.baddbmm( + batch1=query_layer_1, batch2=key_layer_1, beta=1.0, alpha=0.125 + ) + query_layer_1 = key_layer_1 = None + attn_weights = attention_scores.view(1, 16, 2, -1) + attention_scores = None + causal_mask_5 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_1 = attn_weights + causal_mask_5 + attn_weights = causal_mask_5 = None + softmax = torch.nn.functional.softmax( + attn_weights_1, dim=-1, dtype=torch.float32 + ) + attn_weights_1 = None + attention_probs = softmax.to(torch.float32) + softmax = None + attention_probs_1 = torch.nn.functional.dropout( + attention_probs, 0.0, False, False + ) + attention_probs = None + attention_probs_reshaped = attention_probs_1.view(16, 2, -1) + attention_probs_1 = None + context_layer = torch.bmm(attention_probs_reshaped, value_layer_1) + attention_probs_reshaped = value_layer_1 = None + x = context_layer.view(1, 16, 2, 64) + context_layer = None + x_1 = x.permute(0, 2, 1, 3) + x = None + context_layer_1 = x_1.reshape(1, 2, 1024) + x_1 = None + output_tensor = torch._C._nn.linear( + context_layer_1, + l_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_1 = l_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_bias_ = (None) + out = torch.nn.functional.dropout(output_tensor, p=0.0, training=False) + output_tensor = None + out_1 = hidden_states + out + hidden_states = out = None + layernorm_output_1 = torch.nn.functional.layer_norm( + out_1, + (1024,), + l_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_2 = torch._C._nn.linear( + layernorm_output_1, + l_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_1 = 
l_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_2 = linear_2 * 0.5 + mul_3 = 0.79788456 * linear_2 + mul_4 = 0.044715 * linear_2 + mul_5 = mul_4 * linear_2 + mul_4 = linear_2 = None + add_3 = 1 + mul_5 + mul_5 = None + mul_6 = mul_3 * add_3 + mul_3 = add_3 = None + tanh = torch.tanh(mul_6) + mul_6 = None + add_4 = 1.0 + tanh + tanh = None + hidden_states_1 = mul_2 * add_4 + mul_2 = add_4 = None + intermediate_output = torch._C._nn.linear( + hidden_states_1, + l_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_1 = l_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_2 = torch.nn.functional.dropout(intermediate_output, p=0.0, training=False) + intermediate_output = None + out_3 = out_1 + out_2 + out_1 = out_2 = None + layernorm_output_2 = torch.nn.functional.layer_norm( + out_3, + (1024,), + l_self_modules_h_modules_1_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_1_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_1_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_1_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_2 = torch._C._nn.linear( + layernorm_output_2, + l_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_2 = l_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_3 = fused_qkv_2.view(1, 2, 16, 3, 64) + fused_qkv_2 = None + getitem_10 = fused_qkv_3[(Ellipsis, 0, slice(None, None, None))] + query_layer_2 = getitem_10.transpose(1, 2) + getitem_10 = None + getitem_11 = fused_qkv_3[(Ellipsis, 1, slice(None, None, None))] + key_layer_2 = getitem_11.transpose(1, 2) + getitem_11 = None + getitem_12 = fused_qkv_3[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_3 = None + value_layer_2 = getitem_12.transpose(1, 2) + getitem_12 = None + query_layer_3 = query_layer_2.reshape(16, -1, 64) + query_layer_2 = None + reshape_7 = key_layer_2.reshape(16, -1, 64) + key_layer_3 = reshape_7.transpose(-1, -2) + reshape_7 = None + value_layer_3 = value_layer_2.reshape(16, -1, 64) + attention_scores_1 = alibi_1.baddbmm( + batch1=query_layer_3, batch2=key_layer_3, beta=1.0, alpha=0.125 + ) + query_layer_3 = key_layer_3 = None + attn_weights_2 = attention_scores_1.view(1, 16, 2, -1) + attention_scores_1 = None + causal_mask_6 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_3 = attn_weights_2 + causal_mask_6 + attn_weights_2 = causal_mask_6 = None + softmax_1 = torch.nn.functional.softmax( + attn_weights_3, dim=-1, dtype=torch.float32 + ) + attn_weights_3 = None + attention_probs_2 = softmax_1.to(torch.float32) + softmax_1 = None + attention_probs_3 = torch.nn.functional.dropout( + attention_probs_2, 0.0, False, False + ) + attention_probs_2 = None + attention_probs_reshaped_1 = attention_probs_3.view(16, 2, -1) + attention_probs_3 = 
None + context_layer_2 = torch.bmm(attention_probs_reshaped_1, value_layer_3) + attention_probs_reshaped_1 = value_layer_3 = None + x_2 = context_layer_2.view(1, 16, 2, 64) + context_layer_2 = None + x_3 = x_2.permute(0, 2, 1, 3) + x_2 = None + context_layer_3 = x_3.reshape(1, 2, 1024) + x_3 = None + output_tensor_1 = torch._C._nn.linear( + context_layer_3, + l_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_3 = l_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_4 = torch.nn.functional.dropout(output_tensor_1, p=0.0, training=False) + output_tensor_1 = None + out_5 = out_3 + out_4 + out_3 = out_4 = None + layernorm_output_3 = torch.nn.functional.layer_norm( + out_5, + (1024,), + l_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_6 = torch._C._nn.linear( + layernorm_output_3, + l_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_3 = l_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_8 = linear_6 * 0.5 + mul_9 = 0.79788456 * linear_6 + mul_10 = 0.044715 * linear_6 + mul_11 = mul_10 * linear_6 + mul_10 = linear_6 = None + add_8 = 1 + mul_11 + mul_11 = None + mul_12 = mul_9 * add_8 + mul_9 = add_8 = None + tanh_1 = torch.tanh(mul_12) + mul_12 = None + add_9 = 1.0 + tanh_1 + tanh_1 = None + hidden_states_2 = mul_8 * add_9 + mul_8 = add_9 = None + intermediate_output_1 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_2 = l_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_6 = torch.nn.functional.dropout( + intermediate_output_1, p=0.0, training=False + ) + intermediate_output_1 = None + out_7 = out_5 + out_6 + out_5 = out_6 = None + layernorm_output_4 = torch.nn.functional.layer_norm( + out_7, + (1024,), + l_self_modules_h_modules_2_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_2_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_2_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_2_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_4 = torch._C._nn.linear( + layernorm_output_4, + l_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_4 = l_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + 
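# NOTE: `alibi_1` built above is the ALiBi bias: with 16 heads the slope base
# is 2 ** (-8 / 16) = 2 ** -0.5 = 0.7071067811865476, per-head slopes are
# base ** (1..16), and each slope scales the cumsum-derived token positions
# (padding positions stay 0). A compact reference, assuming the standard ALiBi
# recipe for a power-of-two head count (illustrative sketch only):
def _alibi_bias_reference(attention_mask, num_heads=16):
    import torch
    slopes = torch.pow(2.0 ** (-8.0 / num_heads), torch.arange(1, num_heads + 1, device=attention_mask.device))
    positions = (attention_mask.cumsum(dim=-1) - 1) * attention_mask  # 0-based positions, padding -> 0
    bias = slopes[:, None] * positions[:, None, :]  # broadcasts to (batch, heads, seq)
    return bias.reshape(attention_mask.shape[0] * num_heads, 1, attention_mask.shape[-1])
# For batch=1, seq_len=2 this yields the same (16, 1, 2) tensor as `alibi_1`.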
fused_qkv_5 = fused_qkv_4.view(1, 2, 16, 3, 64) + fused_qkv_4 = None + getitem_14 = fused_qkv_5[(Ellipsis, 0, slice(None, None, None))] + query_layer_4 = getitem_14.transpose(1, 2) + getitem_14 = None + getitem_15 = fused_qkv_5[(Ellipsis, 1, slice(None, None, None))] + key_layer_4 = getitem_15.transpose(1, 2) + getitem_15 = None + getitem_16 = fused_qkv_5[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_5 = None + value_layer_4 = getitem_16.transpose(1, 2) + getitem_16 = None + query_layer_5 = query_layer_4.reshape(16, -1, 64) + query_layer_4 = None + reshape_11 = key_layer_4.reshape(16, -1, 64) + key_layer_5 = reshape_11.transpose(-1, -2) + reshape_11 = None + value_layer_5 = value_layer_4.reshape(16, -1, 64) + attention_scores_2 = alibi_1.baddbmm( + batch1=query_layer_5, batch2=key_layer_5, beta=1.0, alpha=0.125 + ) + query_layer_5 = key_layer_5 = None + attn_weights_4 = attention_scores_2.view(1, 16, 2, -1) + attention_scores_2 = None + causal_mask_7 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_5 = attn_weights_4 + causal_mask_7 + attn_weights_4 = causal_mask_7 = None + softmax_2 = torch.nn.functional.softmax( + attn_weights_5, dim=-1, dtype=torch.float32 + ) + attn_weights_5 = None + attention_probs_4 = softmax_2.to(torch.float32) + softmax_2 = None + attention_probs_5 = torch.nn.functional.dropout( + attention_probs_4, 0.0, False, False + ) + attention_probs_4 = None + attention_probs_reshaped_2 = attention_probs_5.view(16, 2, -1) + attention_probs_5 = None + context_layer_4 = torch.bmm(attention_probs_reshaped_2, value_layer_5) + attention_probs_reshaped_2 = value_layer_5 = None + x_4 = context_layer_4.view(1, 16, 2, 64) + context_layer_4 = None + x_5 = x_4.permute(0, 2, 1, 3) + x_4 = None + context_layer_5 = x_5.reshape(1, 2, 1024) + x_5 = None + output_tensor_2 = torch._C._nn.linear( + context_layer_5, + l_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_5 = l_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_8 = torch.nn.functional.dropout(output_tensor_2, p=0.0, training=False) + output_tensor_2 = None + out_9 = out_7 + out_8 + out_7 = out_8 = None + layernorm_output_5 = torch.nn.functional.layer_norm( + out_9, + (1024,), + l_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_10 = torch._C._nn.linear( + layernorm_output_5, + l_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_5 = l_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_14 = linear_10 * 0.5 + mul_15 = 0.79788456 * linear_10 + mul_16 = 0.044715 * linear_10 + mul_17 = mul_16 * linear_10 + mul_16 = linear_10 = None + add_13 = 1 + mul_17 + mul_17 = None + mul_18 = mul_15 * add_13 + mul_15 = add_13 = None + 
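# NOTE: The mul_*/add_*/tanh_* chain here is the tanh approximation of GELU,
# 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))), with
# 0.79788456 ~= sqrt(2/pi). A one-line reference (illustrative sketch only);
# it should agree with torch.nn.functional.gelu(x, approximate="tanh") up to
# float tolerance:
def _gelu_tanh_reference(x):
    import torch
    return 0.5 * x * (1.0 + torch.tanh(0.79788456 * (x + 0.044715 * x.pow(3))))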
tanh_2 = torch.tanh(mul_18) + mul_18 = None + add_14 = 1.0 + tanh_2 + tanh_2 = None + hidden_states_3 = mul_14 * add_14 + mul_14 = add_14 = None + intermediate_output_2 = torch._C._nn.linear( + hidden_states_3, + l_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_3 = l_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_10 = torch.nn.functional.dropout( + intermediate_output_2, p=0.0, training=False + ) + intermediate_output_2 = None + out_11 = out_9 + out_10 + out_9 = out_10 = None + layernorm_output_6 = torch.nn.functional.layer_norm( + out_11, + (1024,), + l_self_modules_h_modules_3_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_3_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_3_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_3_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_6 = torch._C._nn.linear( + layernorm_output_6, + l_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_6 = l_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_7 = fused_qkv_6.view(1, 2, 16, 3, 64) + fused_qkv_6 = None + getitem_18 = fused_qkv_7[(Ellipsis, 0, slice(None, None, None))] + query_layer_6 = getitem_18.transpose(1, 2) + getitem_18 = None + getitem_19 = fused_qkv_7[(Ellipsis, 1, slice(None, None, None))] + key_layer_6 = getitem_19.transpose(1, 2) + getitem_19 = None + getitem_20 = fused_qkv_7[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_7 = None + value_layer_6 = getitem_20.transpose(1, 2) + getitem_20 = None + query_layer_7 = query_layer_6.reshape(16, -1, 64) + query_layer_6 = None + reshape_15 = key_layer_6.reshape(16, -1, 64) + key_layer_7 = reshape_15.transpose(-1, -2) + reshape_15 = None + value_layer_7 = value_layer_6.reshape(16, -1, 64) + attention_scores_3 = alibi_1.baddbmm( + batch1=query_layer_7, batch2=key_layer_7, beta=1.0, alpha=0.125 + ) + query_layer_7 = key_layer_7 = None + attn_weights_6 = attention_scores_3.view(1, 16, 2, -1) + attention_scores_3 = None + causal_mask_8 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_7 = attn_weights_6 + causal_mask_8 + attn_weights_6 = causal_mask_8 = None + softmax_3 = torch.nn.functional.softmax( + attn_weights_7, dim=-1, dtype=torch.float32 + ) + attn_weights_7 = None + attention_probs_6 = softmax_3.to(torch.float32) + softmax_3 = None + attention_probs_7 = torch.nn.functional.dropout( + attention_probs_6, 0.0, False, False + ) + attention_probs_6 = None + attention_probs_reshaped_3 = attention_probs_7.view(16, 2, -1) + attention_probs_7 = None + context_layer_6 = torch.bmm(attention_probs_reshaped_3, value_layer_7) + attention_probs_reshaped_3 = value_layer_7 = None + x_6 = context_layer_6.view(1, 16, 2, 64) + context_layer_6 = None + x_7 = x_6.permute(0, 2, 1, 3) + x_6 = None + context_layer_7 = x_7.reshape(1, 2, 1024) + x_7 = None + output_tensor_3 = torch._C._nn.linear( + context_layer_7, + 
l_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_7 = l_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_12 = torch.nn.functional.dropout(output_tensor_3, p=0.0, training=False) + output_tensor_3 = None + out_13 = out_11 + out_12 + out_11 = out_12 = None + layernorm_output_7 = torch.nn.functional.layer_norm( + out_13, + (1024,), + l_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_14 = torch._C._nn.linear( + layernorm_output_7, + l_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_7 = l_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_20 = linear_14 * 0.5 + mul_21 = 0.79788456 * linear_14 + mul_22 = 0.044715 * linear_14 + mul_23 = mul_22 * linear_14 + mul_22 = linear_14 = None + add_18 = 1 + mul_23 + mul_23 = None + mul_24 = mul_21 * add_18 + mul_21 = add_18 = None + tanh_3 = torch.tanh(mul_24) + mul_24 = None + add_19 = 1.0 + tanh_3 + tanh_3 = None + hidden_states_4 = mul_20 * add_19 + mul_20 = add_19 = None + intermediate_output_3 = torch._C._nn.linear( + hidden_states_4, + l_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_4 = l_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_14 = torch.nn.functional.dropout( + intermediate_output_3, p=0.0, training=False + ) + intermediate_output_3 = None + out_15 = out_13 + out_14 + out_13 = out_14 = None + layernorm_output_8 = torch.nn.functional.layer_norm( + out_15, + (1024,), + l_self_modules_h_modules_4_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_4_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_4_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_4_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_8 = torch._C._nn.linear( + layernorm_output_8, + l_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_8 = l_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_9 = fused_qkv_8.view(1, 2, 16, 3, 64) + fused_qkv_8 = None + getitem_22 = fused_qkv_9[(Ellipsis, 0, slice(None, None, None))] + query_layer_8 = getitem_22.transpose(1, 2) + getitem_22 = None + getitem_23 = fused_qkv_9[(Ellipsis, 1, slice(None, None, None))] + key_layer_8 = getitem_23.transpose(1, 2) + getitem_23 = None + 
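# NOTE: Every block repeats this attention recipe: the fused projection is
# viewed as (1, seq, 16, 3, 64) and split into query/key/value, flattened to
# (heads, seq, 64), and scores are computed via baddbmm so the ALiBi bias is
# the addend while alpha=0.125 = 1/sqrt(64) applies the usual 1/sqrt(head_dim)
# scaling. A compact head-batched reference, assuming the same hard-coded
# shapes as above (illustrative sketch only):
def _fused_qkv_attention_reference(fused_qkv, alibi, causal_mask):
    import torch
    q, k, v = fused_qkv.view(1, -1, 16, 3, 64).unbind(dim=3)  # each (1, seq, 16, 64)
    q, k, v = (t.transpose(1, 2).reshape(16, -1, 64) for t in (q, k, v))  # (heads, seq, 64)
    scores = torch.baddbmm(alibi, q, k.transpose(-1, -2), beta=1.0, alpha=0.125)
    probs = torch.softmax(scores.view(1, 16, q.shape[1], -1) + causal_mask, dim=-1)
    return torch.bmm(probs.view(16, q.shape[1], -1), v)  # (heads, seq, 64)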
getitem_24 = fused_qkv_9[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_9 = None + value_layer_8 = getitem_24.transpose(1, 2) + getitem_24 = None + query_layer_9 = query_layer_8.reshape(16, -1, 64) + query_layer_8 = None + reshape_19 = key_layer_8.reshape(16, -1, 64) + key_layer_9 = reshape_19.transpose(-1, -2) + reshape_19 = None + value_layer_9 = value_layer_8.reshape(16, -1, 64) + attention_scores_4 = alibi_1.baddbmm( + batch1=query_layer_9, batch2=key_layer_9, beta=1.0, alpha=0.125 + ) + query_layer_9 = key_layer_9 = None + attn_weights_8 = attention_scores_4.view(1, 16, 2, -1) + attention_scores_4 = None + causal_mask_9 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_9 = attn_weights_8 + causal_mask_9 + attn_weights_8 = causal_mask_9 = None + softmax_4 = torch.nn.functional.softmax( + attn_weights_9, dim=-1, dtype=torch.float32 + ) + attn_weights_9 = None + attention_probs_8 = softmax_4.to(torch.float32) + softmax_4 = None + attention_probs_9 = torch.nn.functional.dropout( + attention_probs_8, 0.0, False, False + ) + attention_probs_8 = None + attention_probs_reshaped_4 = attention_probs_9.view(16, 2, -1) + attention_probs_9 = None + context_layer_8 = torch.bmm(attention_probs_reshaped_4, value_layer_9) + attention_probs_reshaped_4 = value_layer_9 = None + x_8 = context_layer_8.view(1, 16, 2, 64) + context_layer_8 = None + x_9 = x_8.permute(0, 2, 1, 3) + x_8 = None + context_layer_9 = x_9.reshape(1, 2, 1024) + x_9 = None + output_tensor_4 = torch._C._nn.linear( + context_layer_9, + l_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_9 = l_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_16 = torch.nn.functional.dropout(output_tensor_4, p=0.0, training=False) + output_tensor_4 = None + out_17 = out_15 + out_16 + out_15 = out_16 = None + layernorm_output_9 = torch.nn.functional.layer_norm( + out_17, + (1024,), + l_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_18 = torch._C._nn.linear( + layernorm_output_9, + l_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_9 = l_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_26 = linear_18 * 0.5 + mul_27 = 0.79788456 * linear_18 + mul_28 = 0.044715 * linear_18 + mul_29 = mul_28 * linear_18 + mul_28 = linear_18 = None + add_23 = 1 + mul_29 + mul_29 = None + mul_30 = mul_27 * add_23 + mul_27 = add_23 = None + tanh_4 = torch.tanh(mul_30) + mul_30 = None + add_24 = 1.0 + tanh_4 + tanh_4 = None + hidden_states_5 = mul_26 * add_24 + mul_26 = add_24 = None + intermediate_output_4 = torch._C._nn.linear( + hidden_states_5, + l_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + 
l_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_5 = l_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_18 = torch.nn.functional.dropout( + intermediate_output_4, p=0.0, training=False + ) + intermediate_output_4 = None + out_19 = out_17 + out_18 + out_17 = out_18 = None + layernorm_output_10 = torch.nn.functional.layer_norm( + out_19, + (1024,), + l_self_modules_h_modules_5_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_5_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_5_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_5_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_10 = torch._C._nn.linear( + layernorm_output_10, + l_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_10 = l_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_11 = fused_qkv_10.view(1, 2, 16, 3, 64) + fused_qkv_10 = None + getitem_26 = fused_qkv_11[(Ellipsis, 0, slice(None, None, None))] + query_layer_10 = getitem_26.transpose(1, 2) + getitem_26 = None + getitem_27 = fused_qkv_11[(Ellipsis, 1, slice(None, None, None))] + key_layer_10 = getitem_27.transpose(1, 2) + getitem_27 = None + getitem_28 = fused_qkv_11[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_11 = None + value_layer_10 = getitem_28.transpose(1, 2) + getitem_28 = None + query_layer_11 = query_layer_10.reshape(16, -1, 64) + query_layer_10 = None + reshape_23 = key_layer_10.reshape(16, -1, 64) + key_layer_11 = reshape_23.transpose(-1, -2) + reshape_23 = None + value_layer_11 = value_layer_10.reshape(16, -1, 64) + attention_scores_5 = alibi_1.baddbmm( + batch1=query_layer_11, batch2=key_layer_11, beta=1.0, alpha=0.125 + ) + query_layer_11 = key_layer_11 = None + attn_weights_10 = attention_scores_5.view(1, 16, 2, -1) + attention_scores_5 = None + causal_mask_10 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_11 = attn_weights_10 + causal_mask_10 + attn_weights_10 = causal_mask_10 = None + softmax_5 = torch.nn.functional.softmax( + attn_weights_11, dim=-1, dtype=torch.float32 + ) + attn_weights_11 = None + attention_probs_10 = softmax_5.to(torch.float32) + softmax_5 = None + attention_probs_11 = torch.nn.functional.dropout( + attention_probs_10, 0.0, False, False + ) + attention_probs_10 = None + attention_probs_reshaped_5 = attention_probs_11.view(16, 2, -1) + attention_probs_11 = None + context_layer_10 = torch.bmm(attention_probs_reshaped_5, value_layer_11) + attention_probs_reshaped_5 = value_layer_11 = None + x_10 = context_layer_10.view(1, 16, 2, 64) + context_layer_10 = None + x_11 = x_10.permute(0, 2, 1, 3) + x_10 = None + context_layer_11 = x_11.reshape(1, 2, 1024) + x_11 = None + output_tensor_5 = torch._C._nn.linear( + context_layer_11, + l_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_11 = 
l_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_20 = torch.nn.functional.dropout(output_tensor_5, p=0.0, training=False) + output_tensor_5 = None + out_21 = out_19 + out_20 + out_19 = out_20 = None + layernorm_output_11 = torch.nn.functional.layer_norm( + out_21, + (1024,), + l_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_22 = torch._C._nn.linear( + layernorm_output_11, + l_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_11 = l_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_32 = linear_22 * 0.5 + mul_33 = 0.79788456 * linear_22 + mul_34 = 0.044715 * linear_22 + mul_35 = mul_34 * linear_22 + mul_34 = linear_22 = None + add_28 = 1 + mul_35 + mul_35 = None + mul_36 = mul_33 * add_28 + mul_33 = add_28 = None + tanh_5 = torch.tanh(mul_36) + mul_36 = None + add_29 = 1.0 + tanh_5 + tanh_5 = None + hidden_states_6 = mul_32 * add_29 + mul_32 = add_29 = None + intermediate_output_5 = torch._C._nn.linear( + hidden_states_6, + l_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_6 = l_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_22 = torch.nn.functional.dropout( + intermediate_output_5, p=0.0, training=False + ) + intermediate_output_5 = None + out_23 = out_21 + out_22 + out_21 = out_22 = None + layernorm_output_12 = torch.nn.functional.layer_norm( + out_23, + (1024,), + l_self_modules_h_modules_6_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_6_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_6_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_6_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_12 = torch._C._nn.linear( + layernorm_output_12, + l_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_12 = l_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_13 = fused_qkv_12.view(1, 2, 16, 3, 64) + fused_qkv_12 = None + getitem_30 = fused_qkv_13[(Ellipsis, 0, slice(None, None, None))] + query_layer_12 = getitem_30.transpose(1, 2) + getitem_30 = None + getitem_31 = fused_qkv_13[(Ellipsis, 1, slice(None, None, None))] + key_layer_12 = getitem_31.transpose(1, 2) + getitem_31 = None + getitem_32 = fused_qkv_13[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_13 = None + value_layer_12 = getitem_32.transpose(1, 2) + getitem_32 = None + query_layer_13 = 
query_layer_12.reshape(16, -1, 64) + query_layer_12 = None + reshape_27 = key_layer_12.reshape(16, -1, 64) + key_layer_13 = reshape_27.transpose(-1, -2) + reshape_27 = None + value_layer_13 = value_layer_12.reshape(16, -1, 64) + attention_scores_6 = alibi_1.baddbmm( + batch1=query_layer_13, batch2=key_layer_13, beta=1.0, alpha=0.125 + ) + query_layer_13 = key_layer_13 = None + attn_weights_12 = attention_scores_6.view(1, 16, 2, -1) + attention_scores_6 = None + causal_mask_11 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_13 = attn_weights_12 + causal_mask_11 + attn_weights_12 = causal_mask_11 = None + softmax_6 = torch.nn.functional.softmax( + attn_weights_13, dim=-1, dtype=torch.float32 + ) + attn_weights_13 = None + attention_probs_12 = softmax_6.to(torch.float32) + softmax_6 = None + attention_probs_13 = torch.nn.functional.dropout( + attention_probs_12, 0.0, False, False + ) + attention_probs_12 = None + attention_probs_reshaped_6 = attention_probs_13.view(16, 2, -1) + attention_probs_13 = None + context_layer_12 = torch.bmm(attention_probs_reshaped_6, value_layer_13) + attention_probs_reshaped_6 = value_layer_13 = None + x_12 = context_layer_12.view(1, 16, 2, 64) + context_layer_12 = None + x_13 = x_12.permute(0, 2, 1, 3) + x_12 = None + context_layer_13 = x_13.reshape(1, 2, 1024) + x_13 = None + output_tensor_6 = torch._C._nn.linear( + context_layer_13, + l_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_13 = l_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_24 = torch.nn.functional.dropout(output_tensor_6, p=0.0, training=False) + output_tensor_6 = None + out_25 = out_23 + out_24 + out_23 = out_24 = None + layernorm_output_13 = torch.nn.functional.layer_norm( + out_25, + (1024,), + l_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_26 = torch._C._nn.linear( + layernorm_output_13, + l_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_13 = l_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_38 = linear_26 * 0.5 + mul_39 = 0.79788456 * linear_26 + mul_40 = 0.044715 * linear_26 + mul_41 = mul_40 * linear_26 + mul_40 = linear_26 = None + add_33 = 1 + mul_41 + mul_41 = None + mul_42 = mul_39 * add_33 + mul_39 = add_33 = None + tanh_6 = torch.tanh(mul_42) + mul_42 = None + add_34 = 1.0 + tanh_6 + tanh_6 = None + hidden_states_7 = mul_38 * add_34 + mul_38 = add_34 = None + intermediate_output_6 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_7 = 
l_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_26 = torch.nn.functional.dropout( + intermediate_output_6, p=0.0, training=False + ) + intermediate_output_6 = None + out_27 = out_25 + out_26 + out_25 = out_26 = None + layernorm_output_14 = torch.nn.functional.layer_norm( + out_27, + (1024,), + l_self_modules_h_modules_7_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_7_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_7_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_7_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_14 = torch._C._nn.linear( + layernorm_output_14, + l_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_14 = l_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_15 = fused_qkv_14.view(1, 2, 16, 3, 64) + fused_qkv_14 = None + getitem_34 = fused_qkv_15[(Ellipsis, 0, slice(None, None, None))] + query_layer_14 = getitem_34.transpose(1, 2) + getitem_34 = None + getitem_35 = fused_qkv_15[(Ellipsis, 1, slice(None, None, None))] + key_layer_14 = getitem_35.transpose(1, 2) + getitem_35 = None + getitem_36 = fused_qkv_15[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_15 = None + value_layer_14 = getitem_36.transpose(1, 2) + getitem_36 = None + query_layer_15 = query_layer_14.reshape(16, -1, 64) + query_layer_14 = None + reshape_31 = key_layer_14.reshape(16, -1, 64) + key_layer_15 = reshape_31.transpose(-1, -2) + reshape_31 = None + value_layer_15 = value_layer_14.reshape(16, -1, 64) + attention_scores_7 = alibi_1.baddbmm( + batch1=query_layer_15, batch2=key_layer_15, beta=1.0, alpha=0.125 + ) + query_layer_15 = key_layer_15 = None + attn_weights_14 = attention_scores_7.view(1, 16, 2, -1) + attention_scores_7 = None + causal_mask_12 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_15 = attn_weights_14 + causal_mask_12 + attn_weights_14 = causal_mask_12 = None + softmax_7 = torch.nn.functional.softmax( + attn_weights_15, dim=-1, dtype=torch.float32 + ) + attn_weights_15 = None + attention_probs_14 = softmax_7.to(torch.float32) + softmax_7 = None + attention_probs_15 = torch.nn.functional.dropout( + attention_probs_14, 0.0, False, False + ) + attention_probs_14 = None + attention_probs_reshaped_7 = attention_probs_15.view(16, 2, -1) + attention_probs_15 = None + context_layer_14 = torch.bmm(attention_probs_reshaped_7, value_layer_15) + attention_probs_reshaped_7 = value_layer_15 = None + x_14 = context_layer_14.view(1, 16, 2, 64) + context_layer_14 = None + x_15 = x_14.permute(0, 2, 1, 3) + x_14 = None + context_layer_15 = x_15.reshape(1, 2, 1024) + x_15 = None + output_tensor_7 = torch._C._nn.linear( + context_layer_15, + l_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_15 = l_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_weight_ = 
l_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_28 = torch.nn.functional.dropout(output_tensor_7, p=0.0, training=False) + output_tensor_7 = None + out_29 = out_27 + out_28 + out_27 = out_28 = None + layernorm_output_15 = torch.nn.functional.layer_norm( + out_29, + (1024,), + l_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_30 = torch._C._nn.linear( + layernorm_output_15, + l_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_15 = l_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_44 = linear_30 * 0.5 + mul_45 = 0.79788456 * linear_30 + mul_46 = 0.044715 * linear_30 + mul_47 = mul_46 * linear_30 + mul_46 = linear_30 = None + add_38 = 1 + mul_47 + mul_47 = None + mul_48 = mul_45 * add_38 + mul_45 = add_38 = None + tanh_7 = torch.tanh(mul_48) + mul_48 = None + add_39 = 1.0 + tanh_7 + tanh_7 = None + hidden_states_8 = mul_44 * add_39 + mul_44 = add_39 = None + intermediate_output_7 = torch._C._nn.linear( + hidden_states_8, + l_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_8 = l_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_30 = torch.nn.functional.dropout( + intermediate_output_7, p=0.0, training=False + ) + intermediate_output_7 = None + out_31 = out_29 + out_30 + out_29 = out_30 = None + layernorm_output_16 = torch.nn.functional.layer_norm( + out_31, + (1024,), + l_self_modules_h_modules_8_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_8_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_8_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_8_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_16 = torch._C._nn.linear( + layernorm_output_16, + l_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_16 = l_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_17 = fused_qkv_16.view(1, 2, 16, 3, 64) + fused_qkv_16 = None + getitem_38 = fused_qkv_17[(Ellipsis, 0, slice(None, None, None))] + query_layer_16 = getitem_38.transpose(1, 2) + getitem_38 = None + getitem_39 = fused_qkv_17[(Ellipsis, 1, slice(None, None, None))] + key_layer_16 = getitem_39.transpose(1, 2) + getitem_39 = None + getitem_40 = fused_qkv_17[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_17 = None + value_layer_16 = getitem_40.transpose(1, 2) + getitem_40 = None + query_layer_17 = query_layer_16.reshape(16, -1, 64) + query_layer_16 = None + reshape_35 = 
key_layer_16.reshape(16, -1, 64) + key_layer_17 = reshape_35.transpose(-1, -2) + reshape_35 = None + value_layer_17 = value_layer_16.reshape(16, -1, 64) + attention_scores_8 = alibi_1.baddbmm( + batch1=query_layer_17, batch2=key_layer_17, beta=1.0, alpha=0.125 + ) + query_layer_17 = key_layer_17 = None + attn_weights_16 = attention_scores_8.view(1, 16, 2, -1) + attention_scores_8 = None + causal_mask_13 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_17 = attn_weights_16 + causal_mask_13 + attn_weights_16 = causal_mask_13 = None + softmax_8 = torch.nn.functional.softmax( + attn_weights_17, dim=-1, dtype=torch.float32 + ) + attn_weights_17 = None + attention_probs_16 = softmax_8.to(torch.float32) + softmax_8 = None + attention_probs_17 = torch.nn.functional.dropout( + attention_probs_16, 0.0, False, False + ) + attention_probs_16 = None + attention_probs_reshaped_8 = attention_probs_17.view(16, 2, -1) + attention_probs_17 = None + context_layer_16 = torch.bmm(attention_probs_reshaped_8, value_layer_17) + attention_probs_reshaped_8 = value_layer_17 = None + x_16 = context_layer_16.view(1, 16, 2, 64) + context_layer_16 = None + x_17 = x_16.permute(0, 2, 1, 3) + x_16 = None + context_layer_17 = x_17.reshape(1, 2, 1024) + x_17 = None + output_tensor_8 = torch._C._nn.linear( + context_layer_17, + l_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_17 = l_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_32 = torch.nn.functional.dropout(output_tensor_8, p=0.0, training=False) + output_tensor_8 = None + out_33 = out_31 + out_32 + out_31 = out_32 = None + layernorm_output_17 = torch.nn.functional.layer_norm( + out_33, + (1024,), + l_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_34 = torch._C._nn.linear( + layernorm_output_17, + l_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_17 = l_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_50 = linear_34 * 0.5 + mul_51 = 0.79788456 * linear_34 + mul_52 = 0.044715 * linear_34 + mul_53 = mul_52 * linear_34 + mul_52 = linear_34 = None + add_43 = 1 + mul_53 + mul_53 = None + mul_54 = mul_51 * add_43 + mul_51 = add_43 = None + tanh_8 = torch.tanh(mul_54) + mul_54 = None + add_44 = 1.0 + tanh_8 + tanh_8 = None + hidden_states_9 = mul_50 * add_44 + mul_50 = add_44 = None + intermediate_output_8 = torch._C._nn.linear( + hidden_states_9, + l_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_9 = l_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = 
l_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_34 = torch.nn.functional.dropout( + intermediate_output_8, p=0.0, training=False + ) + intermediate_output_8 = None + out_35 = out_33 + out_34 + out_33 = out_34 = None + layernorm_output_18 = torch.nn.functional.layer_norm( + out_35, + (1024,), + l_self_modules_h_modules_9_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_9_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_9_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_9_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_18 = torch._C._nn.linear( + layernorm_output_18, + l_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_18 = l_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_19 = fused_qkv_18.view(1, 2, 16, 3, 64) + fused_qkv_18 = None + getitem_42 = fused_qkv_19[(Ellipsis, 0, slice(None, None, None))] + query_layer_18 = getitem_42.transpose(1, 2) + getitem_42 = None + getitem_43 = fused_qkv_19[(Ellipsis, 1, slice(None, None, None))] + key_layer_18 = getitem_43.transpose(1, 2) + getitem_43 = None + getitem_44 = fused_qkv_19[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_19 = None + value_layer_18 = getitem_44.transpose(1, 2) + getitem_44 = None + query_layer_19 = query_layer_18.reshape(16, -1, 64) + query_layer_18 = None + reshape_39 = key_layer_18.reshape(16, -1, 64) + key_layer_19 = reshape_39.transpose(-1, -2) + reshape_39 = None + value_layer_19 = value_layer_18.reshape(16, -1, 64) + attention_scores_9 = alibi_1.baddbmm( + batch1=query_layer_19, batch2=key_layer_19, beta=1.0, alpha=0.125 + ) + query_layer_19 = key_layer_19 = None + attn_weights_18 = attention_scores_9.view(1, 16, 2, -1) + attention_scores_9 = None + causal_mask_14 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_19 = attn_weights_18 + causal_mask_14 + attn_weights_18 = causal_mask_14 = None + softmax_9 = torch.nn.functional.softmax( + attn_weights_19, dim=-1, dtype=torch.float32 + ) + attn_weights_19 = None + attention_probs_18 = softmax_9.to(torch.float32) + softmax_9 = None + attention_probs_19 = torch.nn.functional.dropout( + attention_probs_18, 0.0, False, False + ) + attention_probs_18 = None + attention_probs_reshaped_9 = attention_probs_19.view(16, 2, -1) + attention_probs_19 = None + context_layer_18 = torch.bmm(attention_probs_reshaped_9, value_layer_19) + attention_probs_reshaped_9 = value_layer_19 = None + x_18 = context_layer_18.view(1, 16, 2, 64) + context_layer_18 = None + x_19 = x_18.permute(0, 2, 1, 3) + x_18 = None + context_layer_19 = x_19.reshape(1, 2, 1024) + x_19 = None + output_tensor_9 = torch._C._nn.linear( + context_layer_19, + l_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_19 = l_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_36 = torch.nn.functional.dropout(output_tensor_9, 
p=0.0, training=False) + output_tensor_9 = None + out_37 = out_35 + out_36 + out_35 = out_36 = None + layernorm_output_19 = torch.nn.functional.layer_norm( + out_37, + (1024,), + l_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_bias_ + ) = None + linear_38 = torch._C._nn.linear( + layernorm_output_19, + l_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_19 = l_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_56 = linear_38 * 0.5 + mul_57 = 0.79788456 * linear_38 + mul_58 = 0.044715 * linear_38 + mul_59 = mul_58 * linear_38 + mul_58 = linear_38 = None + add_48 = 1 + mul_59 + mul_59 = None + mul_60 = mul_57 * add_48 + mul_57 = add_48 = None + tanh_9 = torch.tanh(mul_60) + mul_60 = None + add_49 = 1.0 + tanh_9 + tanh_9 = None + hidden_states_10 = mul_56 * add_49 + mul_56 = add_49 = None + intermediate_output_9 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_10 = l_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_38 = torch.nn.functional.dropout( + intermediate_output_9, p=0.0, training=False + ) + intermediate_output_9 = None + out_39 = out_37 + out_38 + out_37 = out_38 = None + layernorm_output_20 = torch.nn.functional.layer_norm( + out_39, + (1024,), + l_self_modules_h_modules_10_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_10_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_10_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_10_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_20 = torch._C._nn.linear( + layernorm_output_20, + l_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_20 = l_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_21 = fused_qkv_20.view(1, 2, 16, 3, 64) + fused_qkv_20 = None + getitem_46 = fused_qkv_21[(Ellipsis, 0, slice(None, None, None))] + query_layer_20 = getitem_46.transpose(1, 2) + getitem_46 = None + getitem_47 = fused_qkv_21[(Ellipsis, 1, slice(None, None, None))] + key_layer_20 = getitem_47.transpose(1, 2) + getitem_47 = None + getitem_48 = fused_qkv_21[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_21 = None + value_layer_20 = getitem_48.transpose(1, 2) + getitem_48 = None + query_layer_21 = query_layer_20.reshape(16, -1, 64) + query_layer_20 = None + reshape_43 = key_layer_20.reshape(16, -1, 64) + key_layer_21 = reshape_43.transpose(-1, -2) + reshape_43 = None + value_layer_21 = value_layer_20.reshape(16, -1, 64) + 
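# NOTE: the baddbmm below fuses the ALiBi bias into the attention-score +
# matmul: scores = 1.0 * alibi + 0.125 * (query @ key^T), where +
# alpha = 0.125 = 1/sqrt(64) is the usual scaling for head_dim 64 (the key +
# operand is already transposed to (16, 64, S) by the reshape/transpose above). +
# Equivalent eager-mode sketch: +
#     scores = alibi_1 + 0.125 * torch.bmm(query_layer_21, key_layer_21) +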
attention_scores_10 = alibi_1.baddbmm( + batch1=query_layer_21, batch2=key_layer_21, beta=1.0, alpha=0.125 + ) + query_layer_21 = key_layer_21 = None + attn_weights_20 = attention_scores_10.view(1, 16, 2, -1) + attention_scores_10 = None + causal_mask_15 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_21 = attn_weights_20 + causal_mask_15 + attn_weights_20 = causal_mask_15 = None + softmax_10 = torch.nn.functional.softmax( + attn_weights_21, dim=-1, dtype=torch.float32 + ) + attn_weights_21 = None + attention_probs_20 = softmax_10.to(torch.float32) + softmax_10 = None + attention_probs_21 = torch.nn.functional.dropout( + attention_probs_20, 0.0, False, False + ) + attention_probs_20 = None + attention_probs_reshaped_10 = attention_probs_21.view(16, 2, -1) + attention_probs_21 = None + context_layer_20 = torch.bmm(attention_probs_reshaped_10, value_layer_21) + attention_probs_reshaped_10 = value_layer_21 = None + x_20 = context_layer_20.view(1, 16, 2, 64) + context_layer_20 = None + x_21 = x_20.permute(0, 2, 1, 3) + x_20 = None + context_layer_21 = x_21.reshape(1, 2, 1024) + x_21 = None + output_tensor_10 = torch._C._nn.linear( + context_layer_21, + l_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_21 = l_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_40 = torch.nn.functional.dropout(output_tensor_10, p=0.0, training=False) + output_tensor_10 = None + out_41 = out_39 + out_40 + out_39 = out_40 = None + layernorm_output_21 = torch.nn.functional.layer_norm( + out_41, + (1024,), + l_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_42 = torch._C._nn.linear( + layernorm_output_21, + l_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_21 = l_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_62 = linear_42 * 0.5 + mul_63 = 0.79788456 * linear_42 + mul_64 = 0.044715 * linear_42 + mul_65 = mul_64 * linear_42 + mul_64 = linear_42 = None + add_53 = 1 + mul_65 + mul_65 = None + mul_66 = mul_63 * add_53 + mul_63 = add_53 = None + tanh_10 = torch.tanh(mul_66) + mul_66 = None + add_54 = 1.0 + tanh_10 + tanh_10 = None + hidden_states_11 = mul_62 * add_54 + mul_62 = add_54 = None + intermediate_output_10 = torch._C._nn.linear( + hidden_states_11, + l_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_11 = l_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_42 = torch.nn.functional.dropout( + intermediate_output_10, p=0.0, 
training=False + ) + intermediate_output_10 = None + out_43 = out_41 + out_42 + out_41 = out_42 = None + layernorm_output_22 = torch.nn.functional.layer_norm( + out_43, + (1024,), + l_self_modules_h_modules_11_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_11_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_11_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_11_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_22 = torch._C._nn.linear( + layernorm_output_22, + l_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_22 = l_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_23 = fused_qkv_22.view(1, 2, 16, 3, 64) + fused_qkv_22 = None + getitem_50 = fused_qkv_23[(Ellipsis, 0, slice(None, None, None))] + query_layer_22 = getitem_50.transpose(1, 2) + getitem_50 = None + getitem_51 = fused_qkv_23[(Ellipsis, 1, slice(None, None, None))] + key_layer_22 = getitem_51.transpose(1, 2) + getitem_51 = None + getitem_52 = fused_qkv_23[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_23 = None + value_layer_22 = getitem_52.transpose(1, 2) + getitem_52 = None + query_layer_23 = query_layer_22.reshape(16, -1, 64) + query_layer_22 = None + reshape_47 = key_layer_22.reshape(16, -1, 64) + key_layer_23 = reshape_47.transpose(-1, -2) + reshape_47 = None + value_layer_23 = value_layer_22.reshape(16, -1, 64) + attention_scores_11 = alibi_1.baddbmm( + batch1=query_layer_23, batch2=key_layer_23, beta=1.0, alpha=0.125 + ) + query_layer_23 = key_layer_23 = None + attn_weights_22 = attention_scores_11.view(1, 16, 2, -1) + attention_scores_11 = None + causal_mask_16 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_23 = attn_weights_22 + causal_mask_16 + attn_weights_22 = causal_mask_16 = None + softmax_11 = torch.nn.functional.softmax( + attn_weights_23, dim=-1, dtype=torch.float32 + ) + attn_weights_23 = None + attention_probs_22 = softmax_11.to(torch.float32) + softmax_11 = None + attention_probs_23 = torch.nn.functional.dropout( + attention_probs_22, 0.0, False, False + ) + attention_probs_22 = None + attention_probs_reshaped_11 = attention_probs_23.view(16, 2, -1) + attention_probs_23 = None + context_layer_22 = torch.bmm(attention_probs_reshaped_11, value_layer_23) + attention_probs_reshaped_11 = value_layer_23 = None + x_22 = context_layer_22.view(1, 16, 2, 64) + context_layer_22 = None + x_23 = x_22.permute(0, 2, 1, 3) + x_22 = None + context_layer_23 = x_23.reshape(1, 2, 1024) + x_23 = None + output_tensor_11 = torch._C._nn.linear( + context_layer_23, + l_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_23 = l_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_44 = torch.nn.functional.dropout(output_tensor_11, p=0.0, training=False) + output_tensor_11 = None + out_45 = out_43 + out_44 + out_43 = out_44 = None + layernorm_output_23 = 
torch.nn.functional.layer_norm( + out_45, + (1024,), + l_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_46 = torch._C._nn.linear( + layernorm_output_23, + l_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_23 = l_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_68 = linear_46 * 0.5 + mul_69 = 0.79788456 * linear_46 + mul_70 = 0.044715 * linear_46 + mul_71 = mul_70 * linear_46 + mul_70 = linear_46 = None + add_58 = 1 + mul_71 + mul_71 = None + mul_72 = mul_69 * add_58 + mul_69 = add_58 = None + tanh_11 = torch.tanh(mul_72) + mul_72 = None + add_59 = 1.0 + tanh_11 + tanh_11 = None + hidden_states_12 = mul_68 * add_59 + mul_68 = add_59 = None + intermediate_output_11 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_12 = l_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_46 = torch.nn.functional.dropout( + intermediate_output_11, p=0.0, training=False + ) + intermediate_output_11 = None + out_47 = out_45 + out_46 + out_45 = out_46 = None + layernorm_output_24 = torch.nn.functional.layer_norm( + out_47, + (1024,), + l_self_modules_h_modules_12_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_12_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_12_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_12_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_24 = torch._C._nn.linear( + layernorm_output_24, + l_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_24 = l_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_25 = fused_qkv_24.view(1, 2, 16, 3, 64) + fused_qkv_24 = None + getitem_54 = fused_qkv_25[(Ellipsis, 0, slice(None, None, None))] + query_layer_24 = getitem_54.transpose(1, 2) + getitem_54 = None + getitem_55 = fused_qkv_25[(Ellipsis, 1, slice(None, None, None))] + key_layer_24 = getitem_55.transpose(1, 2) + getitem_55 = None + getitem_56 = fused_qkv_25[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_25 = None + value_layer_24 = getitem_56.transpose(1, 2) + getitem_56 = None + query_layer_25 = query_layer_24.reshape(16, -1, 64) + query_layer_24 = None + reshape_51 = key_layer_24.reshape(16, -1, 64) + key_layer_25 = reshape_51.transpose(-1, -2) + reshape_51 = None + value_layer_25 = value_layer_24.reshape(16, -1, 64) + attention_scores_12 = alibi_1.baddbmm( + batch1=query_layer_25, batch2=key_layer_25, beta=1.0, alpha=0.125 + ) + 
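# NOTE: the `x = y = None` rebindings throughout this file are emitted by the +
# tracer to drop the last Python references to dead intermediates, so their +
# buffers can be freed at the same points as in eager execution; they do not +
# affect the computed values. +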
query_layer_25 = key_layer_25 = None + attn_weights_24 = attention_scores_12.view(1, 16, 2, -1) + attention_scores_12 = None + causal_mask_17 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_25 = attn_weights_24 + causal_mask_17 + attn_weights_24 = causal_mask_17 = None + softmax_12 = torch.nn.functional.softmax( + attn_weights_25, dim=-1, dtype=torch.float32 + ) + attn_weights_25 = None + attention_probs_24 = softmax_12.to(torch.float32) + softmax_12 = None + attention_probs_25 = torch.nn.functional.dropout( + attention_probs_24, 0.0, False, False + ) + attention_probs_24 = None + attention_probs_reshaped_12 = attention_probs_25.view(16, 2, -1) + attention_probs_25 = None + context_layer_24 = torch.bmm(attention_probs_reshaped_12, value_layer_25) + attention_probs_reshaped_12 = value_layer_25 = None + x_24 = context_layer_24.view(1, 16, 2, 64) + context_layer_24 = None + x_25 = x_24.permute(0, 2, 1, 3) + x_24 = None + context_layer_25 = x_25.reshape(1, 2, 1024) + x_25 = None + output_tensor_12 = torch._C._nn.linear( + context_layer_25, + l_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_25 = l_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_48 = torch.nn.functional.dropout(output_tensor_12, p=0.0, training=False) + output_tensor_12 = None + out_49 = out_47 + out_48 + out_47 = out_48 = None + layernorm_output_25 = torch.nn.functional.layer_norm( + out_49, + (1024,), + l_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_50 = torch._C._nn.linear( + layernorm_output_25, + l_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_25 = l_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_74 = linear_50 * 0.5 + mul_75 = 0.79788456 * linear_50 + mul_76 = 0.044715 * linear_50 + mul_77 = mul_76 * linear_50 + mul_76 = linear_50 = None + add_63 = 1 + mul_77 + mul_77 = None + mul_78 = mul_75 * add_63 + mul_75 = add_63 = None + tanh_12 = torch.tanh(mul_78) + mul_78 = None + add_64 = 1.0 + tanh_12 + tanh_12 = None + hidden_states_13 = mul_74 * add_64 + mul_74 = add_64 = None + intermediate_output_12 = torch._C._nn.linear( + hidden_states_13, + l_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_13 = l_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_50 = torch.nn.functional.dropout( + intermediate_output_12, p=0.0, training=False + ) + intermediate_output_12 = None + out_51 = out_49 + out_50 + out_49 = out_50 = None + layernorm_output_26 = 
torch.nn.functional.layer_norm( + out_51, + (1024,), + l_self_modules_h_modules_13_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_13_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_13_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_13_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_26 = torch._C._nn.linear( + layernorm_output_26, + l_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_26 = l_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_27 = fused_qkv_26.view(1, 2, 16, 3, 64) + fused_qkv_26 = None + getitem_58 = fused_qkv_27[(Ellipsis, 0, slice(None, None, None))] + query_layer_26 = getitem_58.transpose(1, 2) + getitem_58 = None + getitem_59 = fused_qkv_27[(Ellipsis, 1, slice(None, None, None))] + key_layer_26 = getitem_59.transpose(1, 2) + getitem_59 = None + getitem_60 = fused_qkv_27[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_27 = None + value_layer_26 = getitem_60.transpose(1, 2) + getitem_60 = None + query_layer_27 = query_layer_26.reshape(16, -1, 64) + query_layer_26 = None + reshape_55 = key_layer_26.reshape(16, -1, 64) + key_layer_27 = reshape_55.transpose(-1, -2) + reshape_55 = None + value_layer_27 = value_layer_26.reshape(16, -1, 64) + attention_scores_13 = alibi_1.baddbmm( + batch1=query_layer_27, batch2=key_layer_27, beta=1.0, alpha=0.125 + ) + query_layer_27 = key_layer_27 = None + attn_weights_26 = attention_scores_13.view(1, 16, 2, -1) + attention_scores_13 = None + causal_mask_18 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_27 = attn_weights_26 + causal_mask_18 + attn_weights_26 = causal_mask_18 = None + softmax_13 = torch.nn.functional.softmax( + attn_weights_27, dim=-1, dtype=torch.float32 + ) + attn_weights_27 = None + attention_probs_26 = softmax_13.to(torch.float32) + softmax_13 = None + attention_probs_27 = torch.nn.functional.dropout( + attention_probs_26, 0.0, False, False + ) + attention_probs_26 = None + attention_probs_reshaped_13 = attention_probs_27.view(16, 2, -1) + attention_probs_27 = None + context_layer_26 = torch.bmm(attention_probs_reshaped_13, value_layer_27) + attention_probs_reshaped_13 = value_layer_27 = None + x_26 = context_layer_26.view(1, 16, 2, 64) + context_layer_26 = None + x_27 = x_26.permute(0, 2, 1, 3) + x_26 = None + context_layer_27 = x_27.reshape(1, 2, 1024) + x_27 = None + output_tensor_13 = torch._C._nn.linear( + context_layer_27, + l_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_27 = l_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_52 = torch.nn.functional.dropout(output_tensor_13, p=0.0, training=False) + output_tensor_13 = None + out_53 = out_51 + out_52 + out_51 = out_52 = None + layernorm_output_27 = torch.nn.functional.layer_norm( + out_53, + (1024,), + l_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_weight_, + 
l_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_54 = torch._C._nn.linear( + layernorm_output_27, + l_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_27 = l_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_80 = linear_54 * 0.5 + mul_81 = 0.79788456 * linear_54 + mul_82 = 0.044715 * linear_54 + mul_83 = mul_82 * linear_54 + mul_82 = linear_54 = None + add_68 = 1 + mul_83 + mul_83 = None + mul_84 = mul_81 * add_68 + mul_81 = add_68 = None + tanh_13 = torch.tanh(mul_84) + mul_84 = None + add_69 = 1.0 + tanh_13 + tanh_13 = None + hidden_states_14 = mul_80 * add_69 + mul_80 = add_69 = None + intermediate_output_13 = torch._C._nn.linear( + hidden_states_14, + l_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_14 = l_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_54 = torch.nn.functional.dropout( + intermediate_output_13, p=0.0, training=False + ) + intermediate_output_13 = None + out_55 = out_53 + out_54 + out_53 = out_54 = None + layernorm_output_28 = torch.nn.functional.layer_norm( + out_55, + (1024,), + l_self_modules_h_modules_14_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_14_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_14_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_14_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_28 = torch._C._nn.linear( + layernorm_output_28, + l_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_28 = l_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_29 = fused_qkv_28.view(1, 2, 16, 3, 64) + fused_qkv_28 = None + getitem_62 = fused_qkv_29[(Ellipsis, 0, slice(None, None, None))] + query_layer_28 = getitem_62.transpose(1, 2) + getitem_62 = None + getitem_63 = fused_qkv_29[(Ellipsis, 1, slice(None, None, None))] + key_layer_28 = getitem_63.transpose(1, 2) + getitem_63 = None + getitem_64 = fused_qkv_29[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_29 = None + value_layer_28 = getitem_64.transpose(1, 2) + getitem_64 = None + query_layer_29 = query_layer_28.reshape(16, -1, 64) + query_layer_28 = None + reshape_59 = key_layer_28.reshape(16, -1, 64) + key_layer_29 = reshape_59.transpose(-1, -2) + reshape_59 = None + value_layer_29 = value_layer_28.reshape(16, -1, 64) + attention_scores_14 = alibi_1.baddbmm( + batch1=query_layer_29, batch2=key_layer_29, beta=1.0, alpha=0.125 + ) + query_layer_29 = key_layer_29 = None + attn_weights_28 = attention_scores_14.view(1, 16, 2, -1) + attention_scores_14 = None + 
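# NOTE: `causal_mask_4[:, :, :, :2]` below trims the precomputed additive mask +
# to the current key length (2 in this trace), and the softmax is evaluated in +
# float32 for numerical stability; the p=0.0 dropout is a no-op kept from the +
# module definition. Schematically: +
#     probs = softmax(scores.view(1, 16, 2, -1) + causal_mask_4[..., :2], dim=-1, dtype=torch.float32) +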
causal_mask_19 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_29 = attn_weights_28 + causal_mask_19 + attn_weights_28 = causal_mask_19 = None + softmax_14 = torch.nn.functional.softmax( + attn_weights_29, dim=-1, dtype=torch.float32 + ) + attn_weights_29 = None + attention_probs_28 = softmax_14.to(torch.float32) + softmax_14 = None + attention_probs_29 = torch.nn.functional.dropout( + attention_probs_28, 0.0, False, False + ) + attention_probs_28 = None + attention_probs_reshaped_14 = attention_probs_29.view(16, 2, -1) + attention_probs_29 = None + context_layer_28 = torch.bmm(attention_probs_reshaped_14, value_layer_29) + attention_probs_reshaped_14 = value_layer_29 = None + x_28 = context_layer_28.view(1, 16, 2, 64) + context_layer_28 = None + x_29 = x_28.permute(0, 2, 1, 3) + x_28 = None + context_layer_29 = x_29.reshape(1, 2, 1024) + x_29 = None + output_tensor_14 = torch._C._nn.linear( + context_layer_29, + l_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_29 = l_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_56 = torch.nn.functional.dropout(output_tensor_14, p=0.0, training=False) + output_tensor_14 = None + out_57 = out_55 + out_56 + out_55 = out_56 = None + layernorm_output_29 = torch.nn.functional.layer_norm( + out_57, + (1024,), + l_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_58 = torch._C._nn.linear( + layernorm_output_29, + l_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_29 = l_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_86 = linear_58 * 0.5 + mul_87 = 0.79788456 * linear_58 + mul_88 = 0.044715 * linear_58 + mul_89 = mul_88 * linear_58 + mul_88 = linear_58 = None + add_73 = 1 + mul_89 + mul_89 = None + mul_90 = mul_87 * add_73 + mul_87 = add_73 = None + tanh_14 = torch.tanh(mul_90) + mul_90 = None + add_74 = 1.0 + tanh_14 + tanh_14 = None + hidden_states_15 = mul_86 * add_74 + mul_86 = add_74 = None + intermediate_output_14 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_15 = l_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_58 = torch.nn.functional.dropout( + intermediate_output_14, p=0.0, training=False + ) + intermediate_output_14 = None + out_59 = out_57 + out_58 + out_57 = out_58 = None + layernorm_output_30 = torch.nn.functional.layer_norm( + out_59, + (1024,), + l_self_modules_h_modules_15_modules_input_layernorm_parameters_weight_, 
+ l_self_modules_h_modules_15_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_15_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_15_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_30 = torch._C._nn.linear( + layernorm_output_30, + l_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_30 = l_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_31 = fused_qkv_30.view(1, 2, 16, 3, 64) + fused_qkv_30 = None + getitem_66 = fused_qkv_31[(Ellipsis, 0, slice(None, None, None))] + query_layer_30 = getitem_66.transpose(1, 2) + getitem_66 = None + getitem_67 = fused_qkv_31[(Ellipsis, 1, slice(None, None, None))] + key_layer_30 = getitem_67.transpose(1, 2) + getitem_67 = None + getitem_68 = fused_qkv_31[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_31 = None + value_layer_30 = getitem_68.transpose(1, 2) + getitem_68 = None + query_layer_31 = query_layer_30.reshape(16, -1, 64) + query_layer_30 = None + reshape_63 = key_layer_30.reshape(16, -1, 64) + key_layer_31 = reshape_63.transpose(-1, -2) + reshape_63 = None + value_layer_31 = value_layer_30.reshape(16, -1, 64) + attention_scores_15 = alibi_1.baddbmm( + batch1=query_layer_31, batch2=key_layer_31, beta=1.0, alpha=0.125 + ) + query_layer_31 = key_layer_31 = None + attn_weights_30 = attention_scores_15.view(1, 16, 2, -1) + attention_scores_15 = None + causal_mask_20 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_31 = attn_weights_30 + causal_mask_20 + attn_weights_30 = causal_mask_20 = None + softmax_15 = torch.nn.functional.softmax( + attn_weights_31, dim=-1, dtype=torch.float32 + ) + attn_weights_31 = None + attention_probs_30 = softmax_15.to(torch.float32) + softmax_15 = None + attention_probs_31 = torch.nn.functional.dropout( + attention_probs_30, 0.0, False, False + ) + attention_probs_30 = None + attention_probs_reshaped_15 = attention_probs_31.view(16, 2, -1) + attention_probs_31 = None + context_layer_30 = torch.bmm(attention_probs_reshaped_15, value_layer_31) + attention_probs_reshaped_15 = value_layer_31 = None + x_30 = context_layer_30.view(1, 16, 2, 64) + context_layer_30 = None + x_31 = x_30.permute(0, 2, 1, 3) + x_30 = None + context_layer_31 = x_31.reshape(1, 2, 1024) + x_31 = None + output_tensor_15 = torch._C._nn.linear( + context_layer_31, + l_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_31 = l_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_60 = torch.nn.functional.dropout(output_tensor_15, p=0.0, training=False) + output_tensor_15 = None + out_61 = out_59 + out_60 + out_59 = out_60 = None + layernorm_output_31 = torch.nn.functional.layer_norm( + out_61, + (1024,), + l_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + 
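# NOTE: the mul_*/add_*/tanh_* chain below is the tanh approximation of GELU, +
# unrolled by the tracer; with 0.79788456 ~= sqrt(2/pi) it computes +
#     gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * x * (1 + 0.044715 * x**2))) +
# applied to the dense_h_to_4h output. +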
l_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_62 = torch._C._nn.linear( + layernorm_output_31, + l_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_31 = l_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_92 = linear_62 * 0.5 + mul_93 = 0.79788456 * linear_62 + mul_94 = 0.044715 * linear_62 + mul_95 = mul_94 * linear_62 + mul_94 = linear_62 = None + add_78 = 1 + mul_95 + mul_95 = None + mul_96 = mul_93 * add_78 + mul_93 = add_78 = None + tanh_15 = torch.tanh(mul_96) + mul_96 = None + add_79 = 1.0 + tanh_15 + tanh_15 = None + hidden_states_16 = mul_92 * add_79 + mul_92 = add_79 = None + intermediate_output_15 = torch._C._nn.linear( + hidden_states_16, + l_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_16 = l_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_62 = torch.nn.functional.dropout( + intermediate_output_15, p=0.0, training=False + ) + intermediate_output_15 = None + out_63 = out_61 + out_62 + out_61 = out_62 = None + layernorm_output_32 = torch.nn.functional.layer_norm( + out_63, + (1024,), + l_self_modules_h_modules_16_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_16_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_16_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_16_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_32 = torch._C._nn.linear( + layernorm_output_32, + l_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_32 = l_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_33 = fused_qkv_32.view(1, 2, 16, 3, 64) + fused_qkv_32 = None + getitem_70 = fused_qkv_33[(Ellipsis, 0, slice(None, None, None))] + query_layer_32 = getitem_70.transpose(1, 2) + getitem_70 = None + getitem_71 = fused_qkv_33[(Ellipsis, 1, slice(None, None, None))] + key_layer_32 = getitem_71.transpose(1, 2) + getitem_71 = None + getitem_72 = fused_qkv_33[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_33 = None + value_layer_32 = getitem_72.transpose(1, 2) + getitem_72 = None + query_layer_33 = query_layer_32.reshape(16, -1, 64) + query_layer_32 = None + reshape_67 = key_layer_32.reshape(16, -1, 64) + key_layer_33 = reshape_67.transpose(-1, -2) + reshape_67 = None + value_layer_33 = value_layer_32.reshape(16, -1, 64) + attention_scores_16 = alibi_1.baddbmm( + batch1=query_layer_33, batch2=key_layer_33, beta=1.0, alpha=0.125 + ) + query_layer_33 = key_layer_33 = None + attn_weights_32 = attention_scores_16.view(1, 16, 2, -1) + attention_scores_16 = None + causal_mask_21 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, 
None, None), + slice(None, 2, None), + ) + ] + attn_weights_33 = attn_weights_32 + causal_mask_21 + attn_weights_32 = causal_mask_21 = None + softmax_16 = torch.nn.functional.softmax( + attn_weights_33, dim=-1, dtype=torch.float32 + ) + attn_weights_33 = None + attention_probs_32 = softmax_16.to(torch.float32) + softmax_16 = None + attention_probs_33 = torch.nn.functional.dropout( + attention_probs_32, 0.0, False, False + ) + attention_probs_32 = None + attention_probs_reshaped_16 = attention_probs_33.view(16, 2, -1) + attention_probs_33 = None + context_layer_32 = torch.bmm(attention_probs_reshaped_16, value_layer_33) + attention_probs_reshaped_16 = value_layer_33 = None + x_32 = context_layer_32.view(1, 16, 2, 64) + context_layer_32 = None + x_33 = x_32.permute(0, 2, 1, 3) + x_32 = None + context_layer_33 = x_33.reshape(1, 2, 1024) + x_33 = None + output_tensor_16 = torch._C._nn.linear( + context_layer_33, + l_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_33 = l_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_64 = torch.nn.functional.dropout(output_tensor_16, p=0.0, training=False) + output_tensor_16 = None + out_65 = out_63 + out_64 + out_63 = out_64 = None + layernorm_output_33 = torch.nn.functional.layer_norm( + out_65, + (1024,), + l_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_66 = torch._C._nn.linear( + layernorm_output_33, + l_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_33 = l_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_98 = linear_66 * 0.5 + mul_99 = 0.79788456 * linear_66 + mul_100 = 0.044715 * linear_66 + mul_101 = mul_100 * linear_66 + mul_100 = linear_66 = None + add_83 = 1 + mul_101 + mul_101 = None + mul_102 = mul_99 * add_83 + mul_99 = add_83 = None + tanh_16 = torch.tanh(mul_102) + mul_102 = None + add_84 = 1.0 + tanh_16 + tanh_16 = None + hidden_states_17 = mul_98 * add_84 + mul_98 = add_84 = None + intermediate_output_16 = torch._C._nn.linear( + hidden_states_17, + l_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_17 = l_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_66 = torch.nn.functional.dropout( + intermediate_output_16, p=0.0, training=False + ) + intermediate_output_16 = None + out_67 = out_65 + out_66 + out_65 = out_66 = None + layernorm_output_34 = torch.nn.functional.layer_norm( + out_67, + (1024,), + l_self_modules_h_modules_17_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_17_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + 
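# NOTE: the view/index/transpose sequence below splits the fused QKV +
# projection: (batch=1, seq=2, 3*1024) is viewed as (1, 2, heads=16, 3, +
# head_dim=64), then [..., 0, :], [..., 1, :], [..., 2, :] select Q, K, V and +
# transpose(1, 2) moves the head axis ahead of the sequence axis. Sketch: +
#     q, k, v = (fused_qkv.view(1, 2, 16, 3, 64)[..., i, :].transpose(1, 2) for i in range(3)) +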
l_self_modules_h_modules_17_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_17_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_34 = torch._C._nn.linear( + layernorm_output_34, + l_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_34 = l_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_35 = fused_qkv_34.view(1, 2, 16, 3, 64) + fused_qkv_34 = None + getitem_74 = fused_qkv_35[(Ellipsis, 0, slice(None, None, None))] + query_layer_34 = getitem_74.transpose(1, 2) + getitem_74 = None + getitem_75 = fused_qkv_35[(Ellipsis, 1, slice(None, None, None))] + key_layer_34 = getitem_75.transpose(1, 2) + getitem_75 = None + getitem_76 = fused_qkv_35[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_35 = None + value_layer_34 = getitem_76.transpose(1, 2) + getitem_76 = None + query_layer_35 = query_layer_34.reshape(16, -1, 64) + query_layer_34 = None + reshape_71 = key_layer_34.reshape(16, -1, 64) + key_layer_35 = reshape_71.transpose(-1, -2) + reshape_71 = None + value_layer_35 = value_layer_34.reshape(16, -1, 64) + attention_scores_17 = alibi_1.baddbmm( + batch1=query_layer_35, batch2=key_layer_35, beta=1.0, alpha=0.125 + ) + query_layer_35 = key_layer_35 = None + attn_weights_34 = attention_scores_17.view(1, 16, 2, -1) + attention_scores_17 = None + causal_mask_22 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_35 = attn_weights_34 + causal_mask_22 + attn_weights_34 = causal_mask_22 = None + softmax_17 = torch.nn.functional.softmax( + attn_weights_35, dim=-1, dtype=torch.float32 + ) + attn_weights_35 = None + attention_probs_34 = softmax_17.to(torch.float32) + softmax_17 = None + attention_probs_35 = torch.nn.functional.dropout( + attention_probs_34, 0.0, False, False + ) + attention_probs_34 = None + attention_probs_reshaped_17 = attention_probs_35.view(16, 2, -1) + attention_probs_35 = None + context_layer_34 = torch.bmm(attention_probs_reshaped_17, value_layer_35) + attention_probs_reshaped_17 = value_layer_35 = None + x_34 = context_layer_34.view(1, 16, 2, 64) + context_layer_34 = None + x_35 = x_34.permute(0, 2, 1, 3) + x_34 = None + context_layer_35 = x_35.reshape(1, 2, 1024) + x_35 = None + output_tensor_17 = torch._C._nn.linear( + context_layer_35, + l_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_35 = l_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_68 = torch.nn.functional.dropout(output_tensor_17, p=0.0, training=False) + output_tensor_17 = None + out_69 = out_67 + out_68 + out_67 = out_68 = None + layernorm_output_35 = torch.nn.functional.layer_norm( + out_69, + (1024,), + l_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_weight_ = 
l_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_70 = torch._C._nn.linear( + layernorm_output_35, + l_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_35 = l_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_104 = linear_70 * 0.5 + mul_105 = 0.79788456 * linear_70 + mul_106 = 0.044715 * linear_70 + mul_107 = mul_106 * linear_70 + mul_106 = linear_70 = None + add_88 = 1 + mul_107 + mul_107 = None + mul_108 = mul_105 * add_88 + mul_105 = add_88 = None + tanh_17 = torch.tanh(mul_108) + mul_108 = None + add_89 = 1.0 + tanh_17 + tanh_17 = None + hidden_states_18 = mul_104 * add_89 + mul_104 = add_89 = None + intermediate_output_17 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_18 = l_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_70 = torch.nn.functional.dropout( + intermediate_output_17, p=0.0, training=False + ) + intermediate_output_17 = None + out_71 = out_69 + out_70 + out_69 = out_70 = None + layernorm_output_36 = torch.nn.functional.layer_norm( + out_71, + (1024,), + l_self_modules_h_modules_18_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_18_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_18_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_18_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_36 = torch._C._nn.linear( + layernorm_output_36, + l_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_36 = l_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_37 = fused_qkv_36.view(1, 2, 16, 3, 64) + fused_qkv_36 = None + getitem_78 = fused_qkv_37[(Ellipsis, 0, slice(None, None, None))] + query_layer_36 = getitem_78.transpose(1, 2) + getitem_78 = None + getitem_79 = fused_qkv_37[(Ellipsis, 1, slice(None, None, None))] + key_layer_36 = getitem_79.transpose(1, 2) + getitem_79 = None + getitem_80 = fused_qkv_37[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_37 = None + value_layer_36 = getitem_80.transpose(1, 2) + getitem_80 = None + query_layer_37 = query_layer_36.reshape(16, -1, 64) + query_layer_36 = None + reshape_75 = key_layer_36.reshape(16, -1, 64) + key_layer_37 = reshape_75.transpose(-1, -2) + reshape_75 = None + value_layer_37 = value_layer_36.reshape(16, -1, 64) + attention_scores_18 = alibi_1.baddbmm( + batch1=query_layer_37, batch2=key_layer_37, beta=1.0, alpha=0.125 + ) + query_layer_37 = key_layer_37 = None + attn_weights_36 = attention_scores_18.view(1, 16, 2, -1) + attention_scores_18 = None + causal_mask_23 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_37 = 
attn_weights_36 + causal_mask_23 + attn_weights_36 = causal_mask_23 = None + softmax_18 = torch.nn.functional.softmax( + attn_weights_37, dim=-1, dtype=torch.float32 + ) + attn_weights_37 = None + attention_probs_36 = softmax_18.to(torch.float32) + softmax_18 = None + attention_probs_37 = torch.nn.functional.dropout( + attention_probs_36, 0.0, False, False + ) + attention_probs_36 = None + attention_probs_reshaped_18 = attention_probs_37.view(16, 2, -1) + attention_probs_37 = None + context_layer_36 = torch.bmm(attention_probs_reshaped_18, value_layer_37) + attention_probs_reshaped_18 = value_layer_37 = None + x_36 = context_layer_36.view(1, 16, 2, 64) + context_layer_36 = None + x_37 = x_36.permute(0, 2, 1, 3) + x_36 = None + context_layer_37 = x_37.reshape(1, 2, 1024) + x_37 = None + output_tensor_18 = torch._C._nn.linear( + context_layer_37, + l_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_37 = l_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_72 = torch.nn.functional.dropout(output_tensor_18, p=0.0, training=False) + output_tensor_18 = None + out_73 = out_71 + out_72 + out_71 = out_72 = None + layernorm_output_37 = torch.nn.functional.layer_norm( + out_73, + (1024,), + l_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_74 = torch._C._nn.linear( + layernorm_output_37, + l_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_37 = l_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_110 = linear_74 * 0.5 + mul_111 = 0.79788456 * linear_74 + mul_112 = 0.044715 * linear_74 + mul_113 = mul_112 * linear_74 + mul_112 = linear_74 = None + add_93 = 1 + mul_113 + mul_113 = None + mul_114 = mul_111 * add_93 + mul_111 = add_93 = None + tanh_18 = torch.tanh(mul_114) + mul_114 = None + add_94 = 1.0 + tanh_18 + tanh_18 = None + hidden_states_19 = mul_110 * add_94 + mul_110 = add_94 = None + intermediate_output_18 = torch._C._nn.linear( + hidden_states_19, + l_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_19 = l_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_74 = torch.nn.functional.dropout( + intermediate_output_18, p=0.0, training=False + ) + intermediate_output_18 = None + out_75 = out_73 + out_74 + out_73 = out_74 = None + layernorm_output_38 = torch.nn.functional.layer_norm( + out_75, + (1024,), + l_self_modules_h_modules_19_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_19_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + 
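# NOTE: every transformer block in this span repeats the same pre-LayerNorm +
# pattern, differing only in the layer index baked into the parameter names: +
#     h = x + dense(attention(input_layernorm(x)))                            # attention + residual +
#     x = h + dense_4h_to_h(gelu(dense_h_to_4h(post_attention_layernorm(h)))) # MLP + residual +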
l_self_modules_h_modules_19_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_19_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_38 = torch._C._nn.linear( + layernorm_output_38, + l_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_38 = l_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_39 = fused_qkv_38.view(1, 2, 16, 3, 64) + fused_qkv_38 = None + getitem_82 = fused_qkv_39[(Ellipsis, 0, slice(None, None, None))] + query_layer_38 = getitem_82.transpose(1, 2) + getitem_82 = None + getitem_83 = fused_qkv_39[(Ellipsis, 1, slice(None, None, None))] + key_layer_38 = getitem_83.transpose(1, 2) + getitem_83 = None + getitem_84 = fused_qkv_39[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_39 = None + value_layer_38 = getitem_84.transpose(1, 2) + getitem_84 = None + query_layer_39 = query_layer_38.reshape(16, -1, 64) + query_layer_38 = None + reshape_79 = key_layer_38.reshape(16, -1, 64) + key_layer_39 = reshape_79.transpose(-1, -2) + reshape_79 = None + value_layer_39 = value_layer_38.reshape(16, -1, 64) + attention_scores_19 = alibi_1.baddbmm( + batch1=query_layer_39, batch2=key_layer_39, beta=1.0, alpha=0.125 + ) + query_layer_39 = key_layer_39 = None + attn_weights_38 = attention_scores_19.view(1, 16, 2, -1) + attention_scores_19 = None + causal_mask_24 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_39 = attn_weights_38 + causal_mask_24 + attn_weights_38 = causal_mask_24 = None + softmax_19 = torch.nn.functional.softmax( + attn_weights_39, dim=-1, dtype=torch.float32 + ) + attn_weights_39 = None + attention_probs_38 = softmax_19.to(torch.float32) + softmax_19 = None + attention_probs_39 = torch.nn.functional.dropout( + attention_probs_38, 0.0, False, False + ) + attention_probs_38 = None + attention_probs_reshaped_19 = attention_probs_39.view(16, 2, -1) + attention_probs_39 = None + context_layer_38 = torch.bmm(attention_probs_reshaped_19, value_layer_39) + attention_probs_reshaped_19 = value_layer_39 = None + x_38 = context_layer_38.view(1, 16, 2, 64) + context_layer_38 = None + x_39 = x_38.permute(0, 2, 1, 3) + x_38 = None + context_layer_39 = x_39.reshape(1, 2, 1024) + x_39 = None + output_tensor_19 = torch._C._nn.linear( + context_layer_39, + l_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_39 = l_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_76 = torch.nn.functional.dropout(output_tensor_19, p=0.0, training=False) + output_tensor_19 = None + out_77 = out_75 + out_76 + out_75 = out_76 = None + layernorm_output_39 = torch.nn.functional.layer_norm( + out_77, + (1024,), + l_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_weight_ = 
l_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_78 = torch._C._nn.linear( + layernorm_output_39, + l_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_39 = l_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_116 = linear_78 * 0.5 + mul_117 = 0.79788456 * linear_78 + mul_118 = 0.044715 * linear_78 + mul_119 = mul_118 * linear_78 + mul_118 = linear_78 = None + add_98 = 1 + mul_119 + mul_119 = None + mul_120 = mul_117 * add_98 + mul_117 = add_98 = None + tanh_19 = torch.tanh(mul_120) + mul_120 = None + add_99 = 1.0 + tanh_19 + tanh_19 = None + hidden_states_20 = mul_116 * add_99 + mul_116 = add_99 = None + intermediate_output_19 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_20 = l_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_78 = torch.nn.functional.dropout( + intermediate_output_19, p=0.0, training=False + ) + intermediate_output_19 = None + out_79 = out_77 + out_78 + out_77 = out_78 = None + layernorm_output_40 = torch.nn.functional.layer_norm( + out_79, + (1024,), + l_self_modules_h_modules_20_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_20_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_20_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_20_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_40 = torch._C._nn.linear( + layernorm_output_40, + l_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_40 = l_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_41 = fused_qkv_40.view(1, 2, 16, 3, 64) + fused_qkv_40 = None + getitem_86 = fused_qkv_41[(Ellipsis, 0, slice(None, None, None))] + query_layer_40 = getitem_86.transpose(1, 2) + getitem_86 = None + getitem_87 = fused_qkv_41[(Ellipsis, 1, slice(None, None, None))] + key_layer_40 = getitem_87.transpose(1, 2) + getitem_87 = None + getitem_88 = fused_qkv_41[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_41 = None + value_layer_40 = getitem_88.transpose(1, 2) + getitem_88 = None + query_layer_41 = query_layer_40.reshape(16, -1, 64) + query_layer_40 = None + reshape_83 = key_layer_40.reshape(16, -1, 64) + key_layer_41 = reshape_83.transpose(-1, -2) + reshape_83 = None + value_layer_41 = value_layer_40.reshape(16, -1, 64) + attention_scores_20 = alibi_1.baddbmm( + batch1=query_layer_41, batch2=key_layer_41, beta=1.0, alpha=0.125 + ) + query_layer_41 = key_layer_41 = None + attn_weights_40 = attention_scores_20.view(1, 16, 2, -1) + attention_scores_20 = None + causal_mask_25 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_41 = 
attn_weights_40 + causal_mask_25 + attn_weights_40 = causal_mask_25 = None + softmax_20 = torch.nn.functional.softmax( + attn_weights_41, dim=-1, dtype=torch.float32 + ) + attn_weights_41 = None + attention_probs_40 = softmax_20.to(torch.float32) + softmax_20 = None + attention_probs_41 = torch.nn.functional.dropout( + attention_probs_40, 0.0, False, False + ) + attention_probs_40 = None + attention_probs_reshaped_20 = attention_probs_41.view(16, 2, -1) + attention_probs_41 = None + context_layer_40 = torch.bmm(attention_probs_reshaped_20, value_layer_41) + attention_probs_reshaped_20 = value_layer_41 = None + x_40 = context_layer_40.view(1, 16, 2, 64) + context_layer_40 = None + x_41 = x_40.permute(0, 2, 1, 3) + x_40 = None + context_layer_41 = x_41.reshape(1, 2, 1024) + x_41 = None + output_tensor_20 = torch._C._nn.linear( + context_layer_41, + l_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_41 = l_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_80 = torch.nn.functional.dropout(output_tensor_20, p=0.0, training=False) + output_tensor_20 = None + out_81 = out_79 + out_80 + out_79 = out_80 = None + layernorm_output_41 = torch.nn.functional.layer_norm( + out_81, + (1024,), + l_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_82 = torch._C._nn.linear( + layernorm_output_41, + l_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_41 = l_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_122 = linear_82 * 0.5 + mul_123 = 0.79788456 * linear_82 + mul_124 = 0.044715 * linear_82 + mul_125 = mul_124 * linear_82 + mul_124 = linear_82 = None + add_103 = 1 + mul_125 + mul_125 = None + mul_126 = mul_123 * add_103 + mul_123 = add_103 = None + tanh_20 = torch.tanh(mul_126) + mul_126 = None + add_104 = 1.0 + tanh_20 + tanh_20 = None + hidden_states_21 = mul_122 * add_104 + mul_122 = add_104 = None + intermediate_output_20 = torch._C._nn.linear( + hidden_states_21, + l_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_21 = l_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_82 = torch.nn.functional.dropout( + intermediate_output_20, p=0.0, training=False + ) + intermediate_output_20 = None + out_83 = out_81 + out_82 + out_81 = out_82 = None + layernorm_output_42 = torch.nn.functional.layer_norm( + out_83, + (1024,), + l_self_modules_h_modules_21_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_21_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + 
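+ # The fused QKV projection is viewed as [batch=1, seq=2, n_heads=16, 3,
+ # head_dim=64]; indices 0/1/2 on the size-3 axis select Q, K and V, which
+ # are transposed to [batch, heads, seq, head_dim] and then reshaped to
+ # [batch*heads=16, seq, head_dim] for the batched matmuls.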
l_self_modules_h_modules_21_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_21_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_42 = torch._C._nn.linear( + layernorm_output_42, + l_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_42 = l_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_43 = fused_qkv_42.view(1, 2, 16, 3, 64) + fused_qkv_42 = None + getitem_90 = fused_qkv_43[(Ellipsis, 0, slice(None, None, None))] + query_layer_42 = getitem_90.transpose(1, 2) + getitem_90 = None + getitem_91 = fused_qkv_43[(Ellipsis, 1, slice(None, None, None))] + key_layer_42 = getitem_91.transpose(1, 2) + getitem_91 = None + getitem_92 = fused_qkv_43[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_43 = None + value_layer_42 = getitem_92.transpose(1, 2) + getitem_92 = None + query_layer_43 = query_layer_42.reshape(16, -1, 64) + query_layer_42 = None + reshape_87 = key_layer_42.reshape(16, -1, 64) + key_layer_43 = reshape_87.transpose(-1, -2) + reshape_87 = None + value_layer_43 = value_layer_42.reshape(16, -1, 64) + attention_scores_21 = alibi_1.baddbmm( + batch1=query_layer_43, batch2=key_layer_43, beta=1.0, alpha=0.125 + ) + query_layer_43 = key_layer_43 = None + attn_weights_42 = attention_scores_21.view(1, 16, 2, -1) + attention_scores_21 = None + causal_mask_26 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_43 = attn_weights_42 + causal_mask_26 + attn_weights_42 = causal_mask_26 = None + softmax_21 = torch.nn.functional.softmax( + attn_weights_43, dim=-1, dtype=torch.float32 + ) + attn_weights_43 = None + attention_probs_42 = softmax_21.to(torch.float32) + softmax_21 = None + attention_probs_43 = torch.nn.functional.dropout( + attention_probs_42, 0.0, False, False + ) + attention_probs_42 = None + attention_probs_reshaped_21 = attention_probs_43.view(16, 2, -1) + attention_probs_43 = None + context_layer_42 = torch.bmm(attention_probs_reshaped_21, value_layer_43) + attention_probs_reshaped_21 = value_layer_43 = None + x_42 = context_layer_42.view(1, 16, 2, 64) + context_layer_42 = None + x_43 = x_42.permute(0, 2, 1, 3) + x_42 = None + context_layer_43 = x_43.reshape(1, 2, 1024) + x_43 = None + output_tensor_21 = torch._C._nn.linear( + context_layer_43, + l_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_43 = l_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_84 = torch.nn.functional.dropout(output_tensor_21, p=0.0, training=False) + output_tensor_21 = None + out_85 = out_83 + out_84 + out_83 = out_84 = None + layernorm_output_43 = torch.nn.functional.layer_norm( + out_85, + (1024,), + l_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_weight_ = 
l_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_86 = torch._C._nn.linear( + layernorm_output_43, + l_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_43 = l_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_128 = linear_86 * 0.5 + mul_129 = 0.79788456 * linear_86 + mul_130 = 0.044715 * linear_86 + mul_131 = mul_130 * linear_86 + mul_130 = linear_86 = None + add_108 = 1 + mul_131 + mul_131 = None + mul_132 = mul_129 * add_108 + mul_129 = add_108 = None + tanh_21 = torch.tanh(mul_132) + mul_132 = None + add_109 = 1.0 + tanh_21 + tanh_21 = None + hidden_states_22 = mul_128 * add_109 + mul_128 = add_109 = None + intermediate_output_21 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_22 = l_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_86 = torch.nn.functional.dropout( + intermediate_output_21, p=0.0, training=False + ) + intermediate_output_21 = None + out_87 = out_85 + out_86 + out_85 = out_86 = None + layernorm_output_44 = torch.nn.functional.layer_norm( + out_87, + (1024,), + l_self_modules_h_modules_22_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_22_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_22_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_22_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_44 = torch._C._nn.linear( + layernorm_output_44, + l_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_44 = l_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_45 = fused_qkv_44.view(1, 2, 16, 3, 64) + fused_qkv_44 = None + getitem_94 = fused_qkv_45[(Ellipsis, 0, slice(None, None, None))] + query_layer_44 = getitem_94.transpose(1, 2) + getitem_94 = None + getitem_95 = fused_qkv_45[(Ellipsis, 1, slice(None, None, None))] + key_layer_44 = getitem_95.transpose(1, 2) + getitem_95 = None + getitem_96 = fused_qkv_45[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_45 = None + value_layer_44 = getitem_96.transpose(1, 2) + getitem_96 = None + query_layer_45 = query_layer_44.reshape(16, -1, 64) + query_layer_44 = None + reshape_91 = key_layer_44.reshape(16, -1, 64) + key_layer_45 = reshape_91.transpose(-1, -2) + reshape_91 = None + value_layer_45 = value_layer_44.reshape(16, -1, 64) + attention_scores_22 = alibi_1.baddbmm( + batch1=query_layer_45, batch2=key_layer_45, beta=1.0, alpha=0.125 + ) + query_layer_45 = key_layer_45 = None + attn_weights_44 = attention_scores_22.view(1, 16, 2, -1) + attention_scores_22 = None + causal_mask_27 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + attn_weights_45 = 
attn_weights_44 + causal_mask_27 + attn_weights_44 = causal_mask_27 = None + softmax_22 = torch.nn.functional.softmax( + attn_weights_45, dim=-1, dtype=torch.float32 + ) + attn_weights_45 = None + attention_probs_44 = softmax_22.to(torch.float32) + softmax_22 = None + attention_probs_45 = torch.nn.functional.dropout( + attention_probs_44, 0.0, False, False + ) + attention_probs_44 = None + attention_probs_reshaped_22 = attention_probs_45.view(16, 2, -1) + attention_probs_45 = None + context_layer_44 = torch.bmm(attention_probs_reshaped_22, value_layer_45) + attention_probs_reshaped_22 = value_layer_45 = None + x_44 = context_layer_44.view(1, 16, 2, 64) + context_layer_44 = None + x_45 = x_44.permute(0, 2, 1, 3) + x_44 = None + context_layer_45 = x_45.reshape(1, 2, 1024) + x_45 = None + output_tensor_22 = torch._C._nn.linear( + context_layer_45, + l_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_45 = l_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_88 = torch.nn.functional.dropout(output_tensor_22, p=0.0, training=False) + output_tensor_22 = None + out_89 = out_87 + out_88 + out_87 = out_88 = None + layernorm_output_45 = torch.nn.functional.layer_norm( + out_89, + (1024,), + l_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_weight_ = l_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_90 = torch._C._nn.linear( + layernorm_output_45, + l_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_45 = l_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_134 = linear_90 * 0.5 + mul_135 = 0.79788456 * linear_90 + mul_136 = 0.044715 * linear_90 + mul_137 = mul_136 * linear_90 + mul_136 = linear_90 = None + add_113 = 1 + mul_137 + mul_137 = None + mul_138 = mul_135 * add_113 + mul_135 = add_113 = None + tanh_22 = torch.tanh(mul_138) + mul_138 = None + add_114 = 1.0 + tanh_22 + tanh_22 = None + hidden_states_23 = mul_134 * add_114 + mul_134 = add_114 = None + intermediate_output_22 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_23 = l_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_90 = torch.nn.functional.dropout( + intermediate_output_22, p=0.0, training=False + ) + intermediate_output_22 = None + out_91 = out_89 + out_90 + out_89 = out_90 = None + layernorm_output_46 = torch.nn.functional.layer_norm( + out_91, + (1024,), + l_self_modules_h_modules_23_modules_input_layernorm_parameters_weight_, + l_self_modules_h_modules_23_modules_input_layernorm_parameters_bias_, + 1e-05, + ) + 
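+ # Block 23 is the last of the model's 24 blocks: once its attention scores
+ # are computed, the shared alibi_1 bias and causal_mask_4 are released
+ # (set to None), the final LayerNorm (ln_f) is applied, and the graph
+ # returns each block's key/value tensors (the KV cache) together with the
+ # final hidden states.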
l_self_modules_h_modules_23_modules_input_layernorm_parameters_weight_ = ( + l_self_modules_h_modules_23_modules_input_layernorm_parameters_bias_ + ) = None + fused_qkv_46 = torch._C._nn.linear( + layernorm_output_46, + l_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_weight_, + l_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_bias_, + ) + layernorm_output_46 = l_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_weight_ = l_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_bias_ = (None) + fused_qkv_47 = fused_qkv_46.view(1, 2, 16, 3, 64) + fused_qkv_46 = None + getitem_98 = fused_qkv_47[(Ellipsis, 0, slice(None, None, None))] + query_layer_46 = getitem_98.transpose(1, 2) + getitem_98 = None + getitem_99 = fused_qkv_47[(Ellipsis, 1, slice(None, None, None))] + key_layer_46 = getitem_99.transpose(1, 2) + getitem_99 = None + getitem_100 = fused_qkv_47[(Ellipsis, 2, slice(None, None, None))] + fused_qkv_47 = None + value_layer_46 = getitem_100.transpose(1, 2) + getitem_100 = None + query_layer_47 = query_layer_46.reshape(16, -1, 64) + query_layer_46 = None + reshape_95 = key_layer_46.reshape(16, -1, 64) + key_layer_47 = reshape_95.transpose(-1, -2) + reshape_95 = None + value_layer_47 = value_layer_46.reshape(16, -1, 64) + attention_scores_23 = alibi_1.baddbmm( + batch1=query_layer_47, batch2=key_layer_47, beta=1.0, alpha=0.125 + ) + alibi_1 = query_layer_47 = key_layer_47 = None + attn_weights_46 = attention_scores_23.view(1, 16, 2, -1) + attention_scores_23 = None + causal_mask_28 = causal_mask_4[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 2, None), + ) + ] + causal_mask_4 = None + attn_weights_47 = attn_weights_46 + causal_mask_28 + attn_weights_46 = causal_mask_28 = None + softmax_23 = torch.nn.functional.softmax( + attn_weights_47, dim=-1, dtype=torch.float32 + ) + attn_weights_47 = None + attention_probs_46 = softmax_23.to(torch.float32) + softmax_23 = None + attention_probs_47 = torch.nn.functional.dropout( + attention_probs_46, 0.0, False, False + ) + attention_probs_46 = None + attention_probs_reshaped_23 = attention_probs_47.view(16, 2, -1) + attention_probs_47 = None + context_layer_46 = torch.bmm(attention_probs_reshaped_23, value_layer_47) + attention_probs_reshaped_23 = value_layer_47 = None + x_46 = context_layer_46.view(1, 16, 2, 64) + context_layer_46 = None + x_47 = x_46.permute(0, 2, 1, 3) + x_46 = None + context_layer_47 = x_47.reshape(1, 2, 1024) + x_47 = None + output_tensor_23 = torch._C._nn.linear( + context_layer_47, + l_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_weight_, + l_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_bias_, + ) + context_layer_47 = l_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_weight_ = l_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_bias_ = (None) + out_92 = torch.nn.functional.dropout(output_tensor_23, p=0.0, training=False) + output_tensor_23 = None + out_93 = out_91 + out_92 + out_91 = out_92 = None + layernorm_output_47 = torch.nn.functional.layer_norm( + out_93, + (1024,), + l_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_weight_, + l_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_bias_, + 1e-05, + ) + l_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_weight_ = 
l_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_bias_ = (None) + linear_94 = torch._C._nn.linear( + layernorm_output_47, + l_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_weight_, + l_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_bias_, + ) + layernorm_output_47 = l_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_weight_ = l_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_bias_ = (None) + mul_140 = linear_94 * 0.5 + mul_141 = 0.79788456 * linear_94 + mul_142 = 0.044715 * linear_94 + mul_143 = mul_142 * linear_94 + mul_142 = linear_94 = None + add_118 = 1 + mul_143 + mul_143 = None + mul_144 = mul_141 * add_118 + mul_141 = add_118 = None + tanh_23 = torch.tanh(mul_144) + mul_144 = None + add_119 = 1.0 + tanh_23 + tanh_23 = None + hidden_states_24 = mul_140 * add_119 + mul_140 = add_119 = None + intermediate_output_23 = torch._C._nn.linear( + hidden_states_24, + l_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_weight_, + l_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_bias_, + ) + hidden_states_24 = l_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_weight_ = l_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_bias_ = (None) + out_94 = torch.nn.functional.dropout( + intermediate_output_23, p=0.0, training=False + ) + intermediate_output_23 = None + out_95 = out_93 + out_94 + out_93 = out_94 = None + hidden_states_25 = torch.nn.functional.layer_norm( + out_95, + (1024,), + l_self_modules_ln_f_parameters_weight_, + l_self_modules_ln_f_parameters_bias_, + 1e-05, + ) + out_95 = ( + l_self_modules_ln_f_parameters_weight_ + ) = l_self_modules_ln_f_parameters_bias_ = None + return ( + value_layer, + key_layer, + value_layer_2, + key_layer_2, + value_layer_4, + key_layer_4, + value_layer_6, + key_layer_6, + value_layer_8, + key_layer_8, + value_layer_10, + key_layer_10, + value_layer_12, + key_layer_12, + value_layer_14, + key_layer_14, + value_layer_16, + key_layer_16, + value_layer_18, + key_layer_18, + value_layer_20, + key_layer_20, + value_layer_22, + key_layer_22, + value_layer_24, + key_layer_24, + value_layer_26, + key_layer_26, + value_layer_28, + key_layer_28, + value_layer_30, + key_layer_30, + value_layer_32, + key_layer_32, + value_layer_34, + key_layer_34, + value_layer_36, + key_layer_36, + value_layer_38, + key_layer_38, + value_layer_40, + key_layer_40, + value_layer_42, + key_layer_42, + value_layer_44, + key_layer_44, + value_layer_46, + key_layer_46, + hidden_states_25, + ) diff --git a/samples/transformers-auto-model/bigscience/bloom-560m/weight_meta.py b/samples/transformers-auto-model/bigscience/bloom-560m/weight_meta.py new file mode 100644 index 000000000..a273dc5cf --- /dev/null +++ b/samples/transformers-auto-model/bigscience/bloom-560m/weight_meta.py @@ -0,0 +1,3102 @@ +class Program_weight_tensor_meta_L_inputs_embeds_: + name = "L_inputs_embeds_" + shape = [1, 2, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_word_embeddings_layernorm_parameters_weight_: + name = "L_self_modules_word_embeddings_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_word_embeddings_layernorm_parameters_bias_: + name = 
"L_self_modules_word_embeddings_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 2] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1] + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_0_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_0_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_0_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_0_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_0_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = 
None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_0_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_1_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_1_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_1_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_1_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_1_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_1_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_2_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_2_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_2_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_2_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_2_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_2_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_3_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_3_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_3_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_3_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_3_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_3_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_4_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_4_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_4_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_4_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_4_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_4_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_5_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_5_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_5_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_5_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_5_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_5_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_6_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_6_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_6_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_6_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_6_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_6_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_7_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_7_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_7_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_7_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_7_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_7_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_8_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_8_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_8_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_8_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_8_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_8_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_9_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_9_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_9_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = ( + "L_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_weight_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_9_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_9_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_9_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_10_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_10_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_10_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_10_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_10_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_10_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_11_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_11_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_11_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_11_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_11_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_11_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_12_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_12_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_12_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_12_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_12_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_12_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_12_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_13_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_13_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_13_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_13_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_13_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_13_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_13_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_14_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_14_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_14_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_14_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_14_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_14_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_14_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_15_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_15_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_15_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_15_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_15_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_15_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_15_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_16_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_16_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_16_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_16_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_16_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_16_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_16_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_17_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_17_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_17_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_17_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_17_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_17_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_17_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_18_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_18_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_18_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_18_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_18_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_18_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_18_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_19_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_19_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_19_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_19_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_19_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_19_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_19_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_20_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = 
None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_20_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_20_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_20_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_20_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_20_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_20_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 
0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_21_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_21_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_21_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_21_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_21_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_21_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = 
None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_21_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_22_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_22_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_22_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_22_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_22_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_22_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_22_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_23_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_input_layernorm_parameters_bias_: + name = "L_self_modules_h_modules_23_modules_input_layernorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_weight_: + name = "L_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_weight_" + shape = [3072, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_bias_: + name = "L_self_modules_h_modules_23_modules_self_attention_modules_query_key_value_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_weight_: + name = "L_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_bias_: + name = "L_self_modules_h_modules_23_modules_self_attention_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_bias_: + name = ( + "L_self_modules_h_modules_23_modules_post_attention_layernorm_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_weight_: + name = "L_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_bias_: + name = ( + "L_self_modules_h_modules_23_modules_mlp_modules_dense_h_to_4h_parameters_bias_" + ) + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_weight_: + name = "L_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_bias_: + name = ( + "L_self_modules_h_modules_23_modules_mlp_modules_dense_4h_to_h_parameters_bias_" + ) + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_ln_f_parameters_weight_: + name = "L_self_modules_ln_f_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_ln_f_parameters_bias_: + name = "L_self_modules_ln_f_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None
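Note on consuming these records: every class in this weight_meta.py stores only summary statistics (`data = None`), so a reader of the sample has to synthesize stand-in tensors from shape/dtype/device plus the recorded mean/std. The sketch below is a minimal, hypothetical consumer that is not part of this patch; the `materialize` helper and its N(mean, std) sampling are assumptions (consistent with the std = 0.020 dense weights and constant layernorm parameters recorded above), and it falls back to CPU when "cuda:0" is unavailable.

import torch

def materialize(meta):
    # "torch.float32" -> torch.float32
    dtype = getattr(torch, meta.dtype.split(".")[-1])
    device = meta.device if torch.cuda.is_available() else "cpu"
    if meta.data is not None:
        # Small tensors may carry literal values; none in this file do.
        return torch.tensor(meta.data, dtype=dtype, device=device)
    t = torch.empty(meta.shape, dtype=dtype, device=device)
    if meta.std == 0.0:
        # Constant parameters, e.g. layernorm weight (1.000) and bias (0.000).
        t.fill_(meta.mean)
    else:
        # Dense/QKV/MLP weights: sample a stand-in from N(mean, std).
        t.normal_(mean=meta.mean, std=meta.std)
    return t

# Example: rebuild the final layernorm weight recorded above.
w = materialize(Program_weight_tensor_meta_L_self_modules_ln_f_parameters_weight_)
assert list(w.shape) == [1024]

Recording statistics instead of raw values keeps each sample's weight_meta.py small while still letting a graph be replayed with realistically scaled tensors; exact parameter values are not needed to exercise the computational graph itself.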