diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 7022ebba..36a5f4f5 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -19,7 +19,7 @@ If the issue occurs while running a Python workload or involves a simulator cras
 
 For example:
 ```
-python3 tests/test_add.py
+python3 tests/ops/elementwise/test_add.py
 ...
 [SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 \
   -m0x80000000:0x1900000000,0x2000000000:0x1000000 \
diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index e7ae6385..da8366af 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -33,7 +33,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_add.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/elementwise/test_add.py
 
   test_transcendental:
     name: Run test_transcendental.py
@@ -52,7 +52,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transcendental.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/elementwise/test_transcendental.py
 
   test_activation:
     name: Run test_activation.py
@@ -71,7 +71,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_activation.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/elementwise/test_activation.py
 
   test_batchnorm:
     name: Run test_batchnorm.py
@@ -90,7 +90,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_batchnorm.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/reduce/test_batchnorm.py
 
   test_bmm:
     name: Run test_bmm.py
@@ -109,7 +109,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_bmm.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/gemm/test_bmm.py
 
   test_cnn:
     name: Run test_cnn.py
@@ -128,7 +128,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cnn.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/conv/test_cnn.py
 
   test_conv2d:
     name: Run test_conv2d.py
@@ -147,7 +147,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_conv2d.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/conv/test_conv2d.py
 
   test_cat:
     name: Run test_cat.py
@@ -166,7 +166,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cat.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_cat.py
 
   test_matmul:
     name: Run test_matmul.py
@@ -185,7 +185,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_matmul.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/gemm/test_matmul.py
 
   test_reduce:
     name: Run test_reduce.py
@@ -204,7 +204,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_reduce.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/reduce/test_reduce.py
 
   test_softmax:
     name: Run test_softmax.py
@@ -223,7 +223,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_softmax.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/reduce/test_softmax.py
 
   test_transpose2D:
     name: Run test_transpose2D.py
@@ -242,7 +242,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose2D.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_transpose2D.py
 
   test_view3D_2D:
     name: Run test_view3D_2D.py
@@ -261,7 +261,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_view3D_2D.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_view3D_2D.py
 
   test_layernorm:
     name: Run test_layernorm.py
@@ -280,7 +280,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_layernorm.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/reduce/test_layernorm.py
 
   test_mlp:
     name: Run test_mlp.py
@@ -299,7 +299,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_mlp.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_mlp.py
 
   test_resnet:
     name: Run test_resnet.py
@@ -318,7 +318,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_resnet.py
 
       - name: Run test_resnet50.py
         run: |
@@ -326,7 +326,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py --model_type resnet50
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_resnet.py --model_type resnet50
 
   test_mobilenet:
     name: Run test_mobilenet.py
@@ -345,7 +345,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/MobileNet/test_mobilenet.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/MobileNet/test_mobilenet.py
 
   test_transformer:
     name: Run test_transformer.py
@@ -364,7 +364,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transformer.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_transformer.py
 
   test_transpose3D:
     name: Run test_transpose3D.py
@@ -383,7 +383,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose3D.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/view/test_transpose3D.py
 
   test_sparsity:
     name: Run test_sparsity.py
@@ -402,7 +402,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_sparsity.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/sparsity/test_sparsity.py
 
   test_pool:
     name: Run test_pool.py
@@ -421,7 +421,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_pool.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/conv/test_pool.py
 
   test_perceptron:
     name: Run test_perceptron.py
@@ -440,7 +440,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_single_perceptron.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_single_perceptron.py
 
   test_fusion:
     name: Run test_fusion
@@ -459,7 +459,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_addmm_residual.py
 
       - name: Run test_matmul_activation.py
         run: |
@@ -467,7 +467,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_matmul_activation.py
 
       - name: Run test_matmul_scalar.py
         run: |
@@ -475,7 +475,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_matmul_scalar.py
 
       - name: Run test_matmul_reduction.py
         run: |
@@ -483,7 +483,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_matmul_reduction.py
 
       - name: Run test_bmm_reduction.py
         run: |
@@ -491,7 +491,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_bmm_reduction.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_bmm_reduction.py
 
       - name: Run test_prologue_fusion.py
         run: |
@@ -499,7 +499,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_prologue_fusion.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_prologue_fusion.py
 
       - name: Run test_transformer_fusion.py
         run: |
@@ -507,7 +507,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_transformer_fusion.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_transformer_fusion.py
 
       - name: Run test_conv_fusion.py
         run: |
@@ -515,7 +515,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_conv_fusion.py
 
       - name: Run test_attention_fusion.py
         run: |
@@ -523,7 +523,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_attention_fusion.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_attention_fusion.py
 
       - name: Run test_matmul_vector.py
         run: |
@@ -531,7 +531,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_vector.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/fusion/test_matmul_vector.py
 
   test_moe:
     name: Run test_moe
@@ -550,7 +550,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/MoE/test_moe.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/MoE/test_moe.py
 
   test_mistral:
     name: Run test_mistral
@@ -569,7 +569,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/Mixtral8x7B/test_attention.py
 
   test_vit:
     name: Run test_vit
@@ -588,7 +588,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_vit.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/test_vit.py
 
   test_diffusion:
     name: Run test_diffusion
@@ -607,7 +607,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Diffusion/test_diffusion.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/Diffusion/test_diffusion.py
 
   test_indirect:
     name: Run test_indirect
@@ -626,7 +626,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_indirect_access.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/misc/test_indirect_access.py
 
   test_scheduler:
     name: Run test_scheduler
@@ -645,7 +645,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/system/test_scheduler.py
 
   test_llama:
     name: Run test_llama1&2
@@ -664,7 +664,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_llama.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/Llama/test_llama.py
 
   test_yolov5:
     name: Run test_yolov5
@@ -683,7 +683,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Yolov5/test_yolov5.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/Yolov5/test_yolov5.py
 
   test_deepseek:
     name: Run test_deepseek
@@ -702,7 +702,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/DeepSeek/test_deepseek_v3_base.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/models/DeepSeek/test_deepseek_v3_base.py
 
   test_eager:
     name: Run test_eager.py
@@ -721,7 +721,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_eager.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/system/test_eager.py
 
   test_exponent:
     name: Run test_exponent.py
@@ -740,7 +740,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_exponent.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/elementwise/test_exponent.py
 
   test_sort:
     name: Run test_sort.py
@@ -759,7 +759,7 @@ jobs:
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_sort.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/ops/sort/test_sort.py
 
   test_accuracy:
     name: Run test_accuracy and test_speedup
diff --git a/CLAUDE.md b/CLAUDE.md
index 8eb99c93..4a4e7424 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -23,7 +23,7 @@ The pipeline runs in that order on every `torch.compile` invocation; you'll see
 | `TOGSim/` | C++ TOGSim source. `src/Simulator.cc`, `Core.cc`, `Dram.cc`, `Interconnect.cc`, `L2Cache.cc`, `Tile.cc`, `TileGraph.cc` are the core models. Externals: ramulator2, booksim, stonneCore, onnx, protobuf, spdlog, yaml-cpp |
 | `AsmParser/` | `tog_generator.py`, `onnx_utility.py` — TOG generation from ONNX/ASM |
 | `configs/` | TOGSim hardware configs (YAML). The default is `systolic_ws_128x128_c1_simple_noc_tpuv3.yml`. Naming pattern: `systolic_ws_<size>_c<cores>_<noc>_<target>.yml` |
-| `tests/` | ~36 op- and model-level tests. Subdirs `DeepSeek/`, `Diffusion/`, `Llama/`, `MLP/`, `Mixtral_8x7B/`, `MoE/`, `Yolov5/`, `Fusion/` for whole-model workloads |
+| `tests/` | Op- and model-level tests organized under `ops/<family>/` (elementwise, reduce, gemm, conv, attention, view, sort, sparsity, misc, fusion), `models/<name>/` (Llama, Mixtral8x7B, DeepSeek, Diffusion, MoE, MLP, MobileNet, Yolov5) plus single-file model tests (test_resnet, test_transformer, test_vit, test_mlp, test_single_perceptron), and `system/` (scheduler, eager, hetro, stonne, vectorops). Shared helper: `tests/_pytorchsim_utils.py` |
 | `experiments/artifact/` | Paper reproduction scripts (`cycle_validation/run_cycle.sh`, `speedup/run_speedup.sh`) |
 | `scripts/` | One-off experiment runners (CompilerOpt, ILS, batch, chiplet, sparsity, stonne, end2end). `build_from_source.sh` builds gem5/llvm/spike |
 | `gem5_script/` | gem5 wrapper scripts called by `CycleSimulator` |
@@ -36,16 +36,16 @@ The pipeline runs in that order on every `torch.compile` invocation; you'll see
 Most tests follow the same pattern: build CPU reference, compile via `torch.compile` on `npu:0`, compare with `torch.allclose` (rtol=atol=1e-4). They all have `if __name__ == "__main__"` blocks.
 
 ```bash
-python tests/test_add.py        # vector add (smoke test, fastest)
-python tests/test_matmul.py     # GEMM
-python tests/test_mlp.py        # MLP forward + backward (training path)
-python tests/test_scheduler.py  # multi-tenant launch_model
-python tests/test_eager.py      # eager-fallback registration
+python tests/ops/elementwise/test_add.py  # vector add (smoke test, fastest)
+python tests/ops/gemm/test_matmul.py      # GEMM
+python tests/models/test_mlp.py           # MLP forward + backward (training path)
+python tests/system/test_scheduler.py     # multi-tenant launch_model
+python tests/system/test_eager.py         # eager-fallback registration
 ```
 
-Run a model from `tests/Llama/`, `tests/DeepSeek/`, etc. similarly.
+Run a model from `tests/models/Llama/`, `tests/models/DeepSeek/`, etc. similarly.
 
-**CI coverage:** the GitHub Actions workflow `.github/workflows/pytorchsim_test.yml` runs an **explicit allowlist** of `tests/*.py` files (~40 jobs, one Docker container per test). Adding a new file under `tests/` does *not* automatically gate PRs — register it in `pytorchsim_test.yml` if you want CI to exercise it. Conversely, files like `tests/test_gqa.py`, `tests/test_gqa_decode.py`, and `tests/test_eager.py` exist in the repo but are *not* in CI, so local validation is the only safety net for them.
+**CI coverage:** the GitHub Actions workflow `.github/workflows/pytorchsim_test.yml` runs an **explicit allowlist** of test files under `tests/` (~40 jobs, one Docker container per test). Adding a new file under `tests/` does *not* automatically gate PRs — register it in `pytorchsim_test.yml` if you want CI to exercise it. Conversely, files like `tests/ops/attention/test_gqa.py` and `tests/ops/attention/test_gqa_decode.py` exist in the repo but are *not* in CI, so local validation is the only safety net for them.
 
 **For fast iteration** (skip functional check):
 ```bash
@@ -123,7 +123,7 @@ Conan deps for TOGSim: `boost/1.79.0`, `robin-hood-hashing/3.11.5`, `spdlog/1.11
 - **Adding a PyTorch device op:** `PyTorchSimDevice/csrc/aten/native/*` (Minimal/Extra split mirrors `torch_openreg`).
 - **TOGSim hardware model changes:** `TOGSim/src/{Core,Dram,Interconnect,L2Cache,Tile,TileGraph}.cc` + matching `include/*.h`.
 - **TOG generation:** `AsmParser/tog_generator.py` builds the raw graph and serializes it via `AsmParser/onnx_utility.py` to **ONNX, which is the on-disk TOG format** consumed by TOGSim.
-- **Eager fallback registration:** `torch.npu.register_eager_to_compile([...])` — see `tests/test_eager.py`.
+- **Eager fallback registration:** `torch.npu.register_eager_to_compile([...])` — see `tests/system/test_eager.py`.
 - **Per-run results:** `togsim_results/<YYYYMMDD_HHMMSS_<hash>>.log` (stats) and `.trace` (instruction trace). The path is also printed at the end of every run.
 - **Wrapper codegen path:** printed as `Wrapper Codegen Path = /tmp/torchinductor_<user>/<hash>/...py` — useful for inspecting generated kernel code and tensor names for `SRAM_BUFFER_PLAN_PATH`.
 
diff --git a/README.md b/README.md
index 800f4761..f0bdc772 100644
--- a/README.md
+++ b/README.md
@@ -40,15 +40,15 @@ PyTorchSim **supports**:
 |---|:-:|:-:|---|
 | ResNet-18 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | channel last format |
 | ResNet-50 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | channel last format |
-| MobileNet-v2 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | `tests/MobileNet/` (torchvision) |
-| YOLOv5 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | `tests/Yolov5/` |
+| MobileNet-v2 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | `tests/models/MobileNet/` (torchvision) |
+| YOLOv5 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | `tests/models/Yolov5/` |
 | BERT | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ |  |
 | GPT-2 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ |  |
-| ViT | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | `tests/test_vit.py` |
+| ViT | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | `tests/models/test_vit.py` |
 | Mistral | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | |
 | Stable-diffusion v1 | 🤗 | ✅ |  |
-| Llama 2/3 | 🤗 | ✅ | `tests/Llama/` (blocks & decode-style paths) |
-| DeepSeek-V3 (base) | 🤗 | ✅ | `tests/DeepSeek/` — several ops(e.g., gate ops) are not cycle-modeled |
+| Llama 2/3 | 🤗 | ✅ | `tests/models/Llama/` (blocks & decode-style paths) |
+| DeepSeek-V3 (base) | 🤗 | ✅ | `tests/models/DeepSeek/` — several ops(e.g., gate ops) are not cycle-modeled |
 | Llama-4 | 🤗 | ⏳ | In development |
 | Broader model support | — | ⏳ | In development |
 <!-- ## Requirements
@@ -104,7 +104,7 @@ The script clones each dep at the tag pinned in [`thirdparty/github-releases.jso
 ### Run Examples
 The `tests` directory contains several AI workload examples.
 ```bash
-python tests/test_matmul.py 
+python tests/ops/gemm/test_matmul.py 
 ```
 The result is written to `${TORCHSIM_LOG_PATH}/togsim_result/XXX.log`. The log file contains detailed core, memory, and interconnect stats.
 
@@ -201,7 +201,7 @@ optimizer.zero_grad()
 loss.backward()
 compiled_step()
 ```
-`tests/test_mlp.py` provides an example of MLP training.
+`tests/models/test_mlp.py` provides an example of MLP training.
 
 ## One TOGSim session, one continuous log
 
@@ -243,7 +243,7 @@ with TOGSimulator(config_path=config):
 Here `synchronize()` acts as a barrier: it does not return until every `launch_model` issued **above** it has finished in the simulator. The later pair of `launch_model` calls therefore runs only after those earlier models have fully completed—so the sync is the point in the timeline where **all preceding launches are done**.
 
 ```bash
-python tests/test_scheduler.py
+python tests/system/test_scheduler.py
 ```
 
 Use a TOGSim config(`.yml`) that defines **partitions** when mapping queues to cores, for example:
diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh
index 7996b5ab..d349ac6b 100755
--- a/scripts/sparsity_experiment/run.sh
+++ b/scripts/sparsity_experiment/run.sh
@@ -6,48 +6,48 @@ export TORCHSIM_FORCE_TIME_N=8
 
 OUTPUT_DIR="12GB"
 export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.yml"
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="24GB"
 export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.yml"
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="48GB"
 export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.yml"
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="12GB_2core"
 export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.yml"
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="24GB_2core"
 export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.yml"
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="48GB_2core"
 export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.yml"
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
-python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.6  > ${OUTPUT_DIR}/0.6
+python3 /workspace/PyTorchSim/tests/ops/sparsity/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
diff --git a/scripts/stonne_experiment/run.sh b/scripts/stonne_experiment/run.sh
index 2e386d9c..e6479aa6 100755
--- a/scripts/stonne_experiment/run.sh
+++ b/scripts/stonne_experiment/run.sh
@@ -2,8 +2,8 @@
 export TORCHSIM_FORCE_TIME_M=1024
 export TORCHSIM_FORCE_TIME_K=1024
 export TORCHSIM_FORCE_TIME_N=1024
-python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.yml --mode 0 > hetero/big_sparse.log
-python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml --mode 1 > hetero/big.log
-python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.yml --mode 2 > hetero/hetero.log
+python3 ../../tests/system/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.yml --mode 0 > hetero/big_sparse.log
+python3 ../../tests/system/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml --mode 1 > hetero/big.log
+python3 ../../tests/system/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.yml --mode 2 > hetero/hetero.log
 
 echo "All processes completed!"
diff --git a/scripts/stonne_experiment/run_trace.sh b/scripts/stonne_experiment/run_trace.sh
index 5a4ff890..f959b07d 100755
--- a/scripts/stonne_experiment/run_trace.sh
+++ b/scripts/stonne_experiment/run_trace.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-SCRIPT="/workspace/PyTorchSim/tests/test_stonne.py"
+SCRIPT="/workspace/PyTorchSim/tests/system/test_stonne.py"
 
 SIZES=(32 64 128)
 SPARSITIES=(0.0 0.2 0.4 0.6 0.8)
diff --git a/tests/_pytorchsim_utils.py b/tests/_pytorchsim_utils.py
new file mode 100644
index 00000000..923f4bb7
--- /dev/null
+++ b/tests/_pytorchsim_utils.py
@@ -0,0 +1,44 @@
+"""Shared helpers for PyTorchSim test files.
+
+Module name is unique (not ``tests._utils``) because ``ultralytics``
+ships a top-level ``tests`` package in site-packages that would shadow it.
+
+Import with:
+
+    import os, sys
+    sys.path.insert(0, os.path.join(
+        os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+    from _pytorchsim_utils import test_result
+"""
+
+import sys
+
+import torch
+
+
+def test_result(name, out, expected, rtol=1e-4, atol=1e-4):
+    """Print a pass/fail banner for ``out`` vs ``expected``; exit 1 on mismatch.
+
+    Inputs exposing ``.cpu()`` are moved to CPU first; ``rtol``/``atol`` are
+    forwarded to ``torch.allclose``. Returns ``None`` when the values match.
+    """
+    out_cpu = out.cpu() if hasattr(out, "cpu") else out
+    expected_cpu = expected.cpu() if hasattr(expected, "cpu") else expected
+    passed = torch.allclose(out_cpu, expected_cpu, rtol=rtol, atol=atol)
+    msg = f"|{name} Test {'Passed' if passed else 'Failed'}|"
+    bar = "-" * len(msg)
+    print(bar)
+    print(msg)
+    print(bar)
+    if passed:
+        return
+
+    print("custom out: ", out_cpu)
+    print("cpu out:    ", expected_cpu)
+    try:
+        max_diff = (out_cpu - expected_cpu).abs().max().item()
+        print(f"Max abs diff: {max_diff}")
+    except Exception as err:
+        # Best-effort diagnostic: report why it is missing instead of hiding it.
+        print(f"Max abs diff: unavailable ({err})")
+    sys.exit(1)
diff --git a/tests/DeepSeek/test_deepseek_v3_base.py b/tests/models/DeepSeek/test_deepseek_v3_base.py
similarity index 92%
rename from tests/DeepSeek/test_deepseek_v3_base.py
rename to tests/models/DeepSeek/test_deepseek_v3_base.py
index ade787c5..5005b70b 100644
--- a/tests/DeepSeek/test_deepseek_v3_base.py
+++ b/tests/models/DeepSeek/test_deepseek_v3_base.py
@@ -4,6 +4,8 @@
 import copy
 from pathlib import Path
 import torch
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 # recursive compile for some ops that are caused by graph break
 torch.npu.register_eager_to_compile([
@@ -18,28 +20,6 @@
 ])
 
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    out_cpu = out.cpu()
-    max_diff = (out_cpu - cpu_out).abs().max().item()
-    mean_diff = (out_cpu - cpu_out).abs().mean().item()
-    if torch.allclose(out_cpu, cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print(f"Max absolute difference: {max_diff:.6f}")
-        print(f"Mean absolute difference: {mean_diff:.6f}")
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("NPU out: ", out_cpu)
-        print("CPU out: ", cpu_out)
-        print(f"Max absolute difference: {max_diff:.6f}")
-        print(f"Mean absolute difference: {mean_diff:.6f}")
-        exit(1)
-
 
 def _extract_logits(output):
     if isinstance(output, torch.Tensor):
diff --git a/tests/Diffusion/test_diffusion.py b/tests/models/Diffusion/test_diffusion.py
similarity index 97%
rename from tests/Diffusion/test_diffusion.py
rename to tests/models/Diffusion/test_diffusion.py
index 85eaba9f..7ad5a759 100644
--- a/tests/Diffusion/test_diffusion.py
+++ b/tests/models/Diffusion/test_diffusion.py
@@ -9,23 +9,8 @@
 from diffusers.models.upsampling import Upsample2D
 from diffusers.models.resnet import ResnetBlock2D
 from diffusers.models.embeddings import Timesteps
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        diff = torch.max(torch.abs(out.cpu() - cpu_out)).item()
-        print(f"Max abs diff: {diff}")
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 @torch.no_grad()
 def test_unet_conditional(
@@ -636,7 +621,6 @@ def test_timesteps(
     parser.add_argument("--prompt", type=str, default="a cat in a hat")
     args = parser.parse_args()
 
-    sys.path.append(os.environ.get("TORCHSIM_DIR", "/workspace/PyTorchSim"))
     device = torch.device("npu:0")
 
     #test_upsample2d(device)
diff --git a/tests/Llama/test_llama.py b/tests/models/Llama/test_llama.py
similarity index 95%
rename from tests/Llama/test_llama.py
rename to tests/models/Llama/test_llama.py
index 5e87b8e7..a4f2f1f2 100644
--- a/tests/Llama/test_llama.py
+++ b/tests/models/Llama/test_llama.py
@@ -5,19 +5,8 @@
 import torch
 from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaDecoderLayer, LlamaRMSNorm, LlamaRotaryEmbedding, LlamaModel
-
-def test_result(name, out, ref, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), ref.cpu(), rtol=rtol, atol=atol):
-        msg = f"|{name} Test Passed|"
-        print("-" * len(msg)); print(msg); print("-" * len(msg))
-    else:
-        msg = f"|{name} Test Failed|"
-        print("-" * len(msg)); print(msg); print("-" * len(msg))
-        diff = (out.cpu().int() - ref.cpu().int()).abs().max().item()
-        print("device out:", out.detach().cpu())
-        print("cpu ref  :", ref.detach().cpu())
-        print(f"Max abs diff: {diff}")
-        sys.exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 @torch.no_grad()
 def run_rmsnorm_test(
diff --git a/tests/MLP/test_mlp.py b/tests/models/MLP/test_mlp.py
similarity index 93%
rename from tests/MLP/test_mlp.py
rename to tests/models/MLP/test_mlp.py
index c910729e..ac3d03e0 100644
--- a/tests/MLP/test_mlp.py
+++ b/tests/models/MLP/test_mlp.py
@@ -19,23 +19,8 @@
 import torch.utils.cpp_extension
 from torch._inductor import config
 
-sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    pass_message = f"|{name} Test Passed|"
-    fail_message = f"|{name} Test Failed|"
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        print("-" * len(pass_message))
-        print(pass_message)
-        print("-" * len(pass_message))
-    else:
-        print("-" * len(fail_message))
-        print(fail_message)
-        print("-" * len(fail_message))
-
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'), 'tests'))
+from _pytorchsim_utils import test_result
 
 class MLP(nn.Module):
     def __init__(self, input_size, output_size, hidden_size):
diff --git a/tests/MLP/test_mlp_cpu.py b/tests/models/MLP/test_mlp_cpu.py
similarity index 96%
rename from tests/MLP/test_mlp_cpu.py
rename to tests/models/MLP/test_mlp_cpu.py
index 112f5d07..620d6cd2 100644
--- a/tests/MLP/test_mlp_cpu.py
+++ b/tests/models/MLP/test_mlp_cpu.py
@@ -20,22 +20,8 @@
 import torch.utils.cpp_extension
 from torch._inductor import config
 
-sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    pass_message = f"|{name} Test Passed|"
-    fail_message = f"|{name} Test Failed|"
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        print("-" * len(pass_message))
-        print(pass_message)
-        print("-" * len(pass_message))
-    else:
-        print("-" * len(fail_message))
-        print(fail_message)
-        print("-" * len(fail_message))
-
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
+sys.path.insert(0, os.path.join(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'), 'tests'))
+from _pytorchsim_utils import test_result
 
 class MLP(nn.Module):
     def __init__(self, input_size, output_size, hidden_size):
diff --git a/tests/Mixtral_8x7B/model.py b/tests/models/Mixtral8x7B/model.py
similarity index 100%
rename from tests/Mixtral_8x7B/model.py
rename to tests/models/Mixtral8x7B/model.py
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/models/Mixtral8x7B/test_attention.py
similarity index 90%
rename from tests/Mixtral_8x7B/test_attention.py
rename to tests/models/Mixtral8x7B/test_attention.py
index 57760370..c993c2ec 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/models/Mixtral8x7B/test_attention.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import copy
 import torch
 from model import Transformer, TransformerBlock, ModelArgs, Attention, FeedForward, KVCache, RMSNorm, precompute_freqs_cis, sample
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_decode(device, prompt_length, nr_tokens):
     # Setup model & model args
diff --git a/tests/MoE/test_moe.py b/tests/models/MoE/test_moe.py
similarity index 98%
rename from tests/MoE/test_moe.py
rename to tests/models/MoE/test_moe.py
index d4cd98f1..6df06803 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/models/MoE/test_moe.py
@@ -14,7 +14,8 @@
 import torch.utils.cpp_extension
 from torch._inductor import config
 
-sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+sys.path.insert(0, os.path.join(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'), 'tests'))
+from _pytorchsim_utils import test_result
 
 # FIXME. This is a Dynamo bug. Solution to avoid is_forward conflict during backward
 def patch_metrics_context_update():
@@ -30,22 +31,6 @@ def patched_update(values, overwrite=True):
     # Patch the method
     get_metrics_context().update = patched_update
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    pass_message = f"|{name} Test Passed|"
-    fail_message = f"|{name} Test Failed|"
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        print("-" * len(pass_message))
-        print(pass_message)
-        print("-" * len(pass_message))
-    else:
-        print("-" * len(fail_message))
-        print(fail_message)
-        print("-" * len(fail_message))
-
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
-
 class SparseDispatcher(object):
     """Helper for implementing a mixture of experts.
     The purpose of this class is to create input minibatches for the
diff --git a/tests/MoE/test_moe_cpu.py b/tests/models/MoE/test_moe_cpu.py
similarity index 100%
rename from tests/MoE/test_moe_cpu.py
rename to tests/models/MoE/test_moe_cpu.py
diff --git a/tests/MobileNet/test_mobilenet.py b/tests/models/MobileNet/test_mobilenet.py
similarity index 84%
rename from tests/MobileNet/test_mobilenet.py
rename to tests/models/MobileNet/test_mobilenet.py
index 966d479a..36ab41b5 100644
--- a/tests/MobileNet/test_mobilenet.py
+++ b/tests/models/MobileNet/test_mobilenet.py
@@ -1,3 +1,4 @@
+import sys
 import argparse
 import copy
 import os
@@ -6,23 +7,10 @@
 import torch._dynamo
 import torch.utils.cpp_extension
 from torchvision.models import mobilenet_v2
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
-
 
 def _mobilenet_v2():
     try:
diff --git a/tests/Yolov5/test_yolov5.py b/tests/models/Yolov5/test_yolov5.py
similarity index 94%
rename from tests/Yolov5/test_yolov5.py
rename to tests/models/Yolov5/test_yolov5.py
index d98828bd..9e93fd50 100644
--- a/tests/Yolov5/test_yolov5.py
+++ b/tests/models/Yolov5/test_yolov5.py
@@ -11,22 +11,10 @@
 from torchvision import transforms
 
 import os
+import sys
 import shutil
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def run_yolo(batch, config):
     import copy
diff --git a/tests/Fusion/__init__.py b/tests/models/__init__.py
similarity index 100%
rename from tests/Fusion/__init__.py
rename to tests/models/__init__.py
diff --git a/tests/test_mlp.py b/tests/models/test_mlp.py
similarity index 88%
rename from tests/test_mlp.py
rename to tests/models/test_mlp.py
index e3f79561..cb27fe76 100644
--- a/tests/test_mlp.py
+++ b/tests/models/test_mlp.py
@@ -1,20 +1,9 @@
+import os
+import sys
 import copy
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 class MLP(torch.nn.Module):
     def __init__(self, input_size=28*28, hidden_size=64, output_size=8):
diff --git a/tests/test_resnet.py b/tests/models/test_resnet.py
similarity index 71%
rename from tests/test_resnet.py
rename to tests/models/test_resnet.py
index 2459cd58..f9f13bd4 100644
--- a/tests/test_resnet.py
+++ b/tests/models/test_resnet.py
@@ -3,21 +3,10 @@
 import torch._dynamo
 import torch.utils.cpp_extension
 from torchvision.models import resnet18, resnet50
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+import os
+import sys
+sys.path.insert(0, os.path.join(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'), 'tests'))
+from _pytorchsim_utils import test_result
 
 def test_resnet(device, batch=1, model_type='resnet18'):
     from torchvision.models import resnet
@@ -47,7 +36,6 @@ def test_resnet(device, batch=1, model_type='resnet18'):
     args = argparse.ArgumentParser()
     args.add_argument('--model_type', type=str, default="resnet18", help='ex) resnet18')
     args = args.parse_args()
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     device = torch.device("npu:0")
     test_resnet(device, model_type=args.model_type)
diff --git a/tests/test_single_perceptron.py b/tests/models/test_single_perceptron.py
similarity index 82%
rename from tests/test_single_perceptron.py
rename to tests/models/test_single_perceptron.py
index 7d3401a3..3b262132 100644
--- a/tests/test_single_perceptron.py
+++ b/tests/models/test_single_perceptron.py
@@ -1,20 +1,9 @@
+import os
+import sys
 import copy
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_single_perceptron(device):
     def perceptron(a, b, c):
diff --git a/tests/test_transformer.py b/tests/models/test_transformer.py
similarity index 88%
rename from tests/test_transformer.py
rename to tests/models/test_transformer.py
index 2b7f308c..2eaa9fd0 100644
--- a/tests/test_transformer.py
+++ b/tests/models/test_transformer.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import math
 import copy
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def clones(module, N):
     "Produce N identical layers."
diff --git a/tests/test_vit.py b/tests/models/test_vit.py
similarity index 92%
rename from tests/test_vit.py
rename to tests/models/test_vit.py
index 6149166d..9fa8a2f0 100644
--- a/tests/test_vit.py
+++ b/tests/models/test_vit.py
@@ -4,21 +4,10 @@
 import argparse
 from torchvision import models
 from torchvision.models.vision_transformer import _vision_transformer, EncoderBlock
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+import os
+import sys
+sys.path.insert(0, os.path.join(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'), 'tests'))
+from _pytorchsim_utils import test_result
 
 def init_vit_weights(m):
     if isinstance(m, torch.nn.Linear):
@@ -201,7 +190,6 @@ def test_encoder_block_with_class_token(
 
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
     device = torch.device("npu:0")
     #test_multihead_attention(device)
     #test_encoder_block(device, seq_len=197)
diff --git a/tests/__init__.py b/tests/ops/__init__.py
similarity index 100%
rename from tests/__init__.py
rename to tests/ops/__init__.py
diff --git a/tests/ops/attention/__init__.py b/tests/ops/attention/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_gqa.py b/tests/ops/attention/test_gqa.py
similarity index 95%
rename from tests/test_gqa.py
rename to tests/ops/attention/test_gqa.py
index ba262fa6..038861c8 100644
--- a/tests/test_gqa.py
+++ b/tests/ops/attention/test_gqa.py
@@ -5,23 +5,10 @@
 import torch.nn.functional as F
 import torch._dynamo
 import argparse
+sys.path.insert(0, os.path.join(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'), 'tests'))
+from _pytorchsim_utils import test_result
 
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
-
 
 class GQAMultiheadAttention(nn.Module):
     """
@@ -300,7 +287,6 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
     
     args = parser.parse_args()
 
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
     device = torch.device("npu:0")
     
     test_repeat_interleave_compilation(
diff --git a/tests/test_gqa_decode.py b/tests/ops/attention/test_gqa_decode.py
similarity index 100%
rename from tests/test_gqa_decode.py
rename to tests/ops/attention/test_gqa_decode.py
diff --git a/tests/test_sdpa.py b/tests/ops/attention/test_sdpa.py
similarity index 100%
rename from tests/test_sdpa.py
rename to tests/ops/attention/test_sdpa.py
diff --git a/tests/ops/conv/__init__.py b/tests/ops/conv/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_cnn.py b/tests/ops/conv/test_cnn.py
similarity index 69%
rename from tests/test_cnn.py
rename to tests/ops/conv/test_cnn.py
index e6b01bbd..dcb4e14c 100644
--- a/tests/test_cnn.py
+++ b/tests/ops/conv/test_cnn.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 class CNN(torch.nn.Module):
     def __init__(self):
diff --git a/tests/test_conv2d.py b/tests/ops/conv/test_conv2d.py
similarity index 85%
rename from tests/test_conv2d.py
rename to tests/ops/conv/test_conv2d.py
index 313003b1..820e990f 100644
--- a/tests/test_conv2d.py
+++ b/tests/ops/conv/test_conv2d.py
@@ -1,20 +1,9 @@
+import os
+import sys
 import torch
 import torch._dynamo
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_conv2d(device, batch_size=1, in_channels=8, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=0):
     def custom_conv2d(a, b, bias):
diff --git a/tests/test_group_conv.py b/tests/ops/conv/test_group_conv.py
similarity index 81%
rename from tests/test_group_conv.py
rename to tests/ops/conv/test_group_conv.py
index 4f97cff6..e9becfe3 100644
--- a/tests/test_group_conv.py
+++ b/tests/ops/conv/test_group_conv.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import torch
 import torch._dynamo
 from Simulator.simulator import TOGSimulator
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_group_convolution(
     device,
diff --git a/tests/test_pool.py b/tests/ops/conv/test_pool.py
similarity index 70%
rename from tests/test_pool.py
rename to tests/ops/conv/test_pool.py
index 2848e04b..91d8fce1 100644
--- a/tests/test_pool.py
+++ b/tests/ops/conv/test_pool.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_maxpool(device, b=1, c=64, h=112, w=112):
     torch.manual_seed(0)
diff --git a/tests/ops/elementwise/__init__.py b/tests/ops/elementwise/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_activation.py b/tests/ops/elementwise/test_activation.py
similarity index 83%
rename from tests/test_activation.py
rename to tests/ops/elementwise/test_activation.py
index dacc102e..f1c5de96 100644
--- a/tests/test_activation.py
+++ b/tests/ops/elementwise/test_activation.py
@@ -1,22 +1,11 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
 import torch.nn.functional as F
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_ReLU(device, size=(128, 128)):
     torch.manual_seed(0)
diff --git a/tests/test_add.py b/tests/ops/elementwise/test_add.py
similarity index 76%
rename from tests/test_add.py
rename to tests/ops/elementwise/test_add.py
index 7a0d23d9..b3c94653 100644
--- a/tests/test_add.py
+++ b/tests/ops/elementwise/test_add.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_vectoradd(device, size=(128, 128)):
     def vectoradd(a, b):
diff --git a/tests/ops/elementwise/test_exponent.py b/tests/ops/elementwise/test_exponent.py
new file mode 100644
index 00000000..5bc57841
--- /dev/null
+++ b/tests/ops/elementwise/test_exponent.py
@@ -0,0 +1,18 @@
+import os
+import sys
+import torch
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
+
+def test_exponent(device, size=(128, 128)):
+    """Check compiled ``exp`` on ``device`` against eager ``exp`` on CPU."""
+    def exponent(a):
+        return a.exp()
+    torch.manual_seed(0)  # fixed seed so a failing input can be reproduced
+    x = torch.randn(size).to(device=device)
+    res = torch.compile(dynamic=False)(exponent)(x)
+    test_result("exponent", res, exponent(x.cpu()))
+
+if __name__ == "__main__":
+    device = torch.device("npu:0")
+    test_exponent(device, size=(32, 32))
diff --git a/tests/test_transcendental.py b/tests/ops/elementwise/test_transcendental.py
similarity index 78%
rename from tests/test_transcendental.py
rename to tests/ops/elementwise/test_transcendental.py
index 34546539..c3a2ee0f 100644
--- a/tests/test_transcendental.py
+++ b/tests/ops/elementwise/test_transcendental.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_tanh(device, size=(128, 128)):
     def tanh(a):
diff --git a/tests/ops/fusion/__init__.py b/tests/ops/fusion/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/Fusion/test_addmm_residual.py b/tests/ops/fusion/test_addmm_residual.py
similarity index 68%
rename from tests/Fusion/test_addmm_residual.py
rename to tests/ops/fusion/test_addmm_residual.py
index a2c17207..e2fd97f1 100644
--- a/tests/Fusion/test_addmm_residual.py
+++ b/tests/ops/fusion/test_addmm_residual.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_addmm_residual(device, input_size=128, hidden_size=128, output_size=128):
     def addmm_residual(a, b, c, d):
diff --git a/tests/Fusion/test_attention_fusion.py b/tests/ops/fusion/test_attention_fusion.py
similarity index 80%
rename from tests/Fusion/test_attention_fusion.py
rename to tests/ops/fusion/test_attention_fusion.py
index 93a17347..3edb589b 100644
--- a/tests/Fusion/test_attention_fusion.py
+++ b/tests/ops/fusion/test_attention_fusion.py
@@ -1,20 +1,9 @@
+import os
+import sys
 import copy
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def clones(module, N):
     "Produce N identical layers."
diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/ops/fusion/test_bmm_reduction.py
similarity index 67%
rename from tests/Fusion/test_bmm_reduction.py
rename to tests/ops/fusion/test_bmm_reduction.py
index 45e31dab..a9e02725 100644
--- a/tests/Fusion/test_bmm_reduction.py
+++ b/tests/ops/fusion/test_bmm_reduction.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_bmm_reduce(device, batch=12, size=512):
     def bmm(a, b):
diff --git a/tests/Fusion/test_conv_fusion.py b/tests/ops/fusion/test_conv_fusion.py
similarity index 94%
rename from tests/Fusion/test_conv_fusion.py
rename to tests/ops/fusion/test_conv_fusion.py
index bc200ff2..eb168086 100644
--- a/tests/Fusion/test_conv_fusion.py
+++ b/tests/ops/fusion/test_conv_fusion.py
@@ -1,15 +1,8 @@
+import os
+import sys
 import torch
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        print("Failed")
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        # exit(1)
 
 def test_conv_residual(device, batch_size=1, in_channels=8, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=0):
diff --git a/tests/Fusion/test_matmul_activation.py b/tests/ops/fusion/test_matmul_activation.py
similarity index 81%
rename from tests/Fusion/test_matmul_activation.py
rename to tests/ops/fusion/test_matmul_activation.py
index 232ec98d..60261ab9 100644
--- a/tests/Fusion/test_matmul_activation.py
+++ b/tests/ops/fusion/test_matmul_activation.py
@@ -1,20 +1,9 @@
+import os
+import sys
 import copy
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 class Matmul_ActivationFn(torch.nn.Module):
     def __init__(self, input_size, output_size, activation_fn):
diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/ops/fusion/test_matmul_reduction.py
similarity index 85%
rename from tests/Fusion/test_matmul_reduction.py
rename to tests/ops/fusion/test_matmul_reduction.py
index 9b09214a..d5dde2af 100644
--- a/tests/Fusion/test_matmul_reduction.py
+++ b/tests/ops/fusion/test_matmul_reduction.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_matmul_reduce(device, M=512, N=512, K=512):
     def matmul_fused(a, b):
diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/ops/fusion/test_matmul_scalar.py
similarity index 58%
rename from tests/Fusion/test_matmul_scalar.py
rename to tests/ops/fusion/test_matmul_scalar.py
index d5a159ed..19a55518 100644
--- a/tests/Fusion/test_matmul_scalar.py
+++ b/tests/ops/fusion/test_matmul_scalar.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_matmul_scalar(device):
     def matmul_fused(a, b, c):
diff --git a/tests/Fusion/test_matmul_vector.py b/tests/ops/fusion/test_matmul_vector.py
similarity index 65%
rename from tests/Fusion/test_matmul_vector.py
rename to tests/ops/fusion/test_matmul_vector.py
index f87f9432..c7e55272 100644
--- a/tests/Fusion/test_matmul_vector.py
+++ b/tests/ops/fusion/test_matmul_vector.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_matmul_vector(device, size=[56, 78, 239], dim=0):
     def matmul_fused(a, b, c, d):
diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/ops/fusion/test_prologue_fusion.py
similarity index 84%
rename from tests/Fusion/test_prologue_fusion.py
rename to tests/ops/fusion/test_prologue_fusion.py
index ecfd5fbf..05d89a39 100644
--- a/tests/Fusion/test_prologue_fusion.py
+++ b/tests/ops/fusion/test_prologue_fusion.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_elem_broadcast_fusion(device):
     def matmul_fused(a, b, c):
diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/ops/fusion/test_transformer_fusion.py
similarity index 94%
rename from tests/Fusion/test_transformer_fusion.py
rename to tests/ops/fusion/test_transformer_fusion.py
index 1581cd97..b6eb274f 100644
--- a/tests/Fusion/test_transformer_fusion.py
+++ b/tests/ops/fusion/test_transformer_fusion.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import math
 import copy
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def clones(module, N):
     "Produce N identical layers."
diff --git a/tests/ops/gemm/__init__.py b/tests/ops/gemm/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_bmm.py b/tests/ops/gemm/test_bmm.py
similarity index 73%
rename from tests/test_bmm.py
rename to tests/ops/gemm/test_bmm.py
index 02a6460e..5f0ae9c9 100644
--- a/tests/test_bmm.py
+++ b/tests/ops/gemm/test_bmm.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_BMM(device, batch_size=1, m=32, n=16, k=64):
     def bmm(a, b):
diff --git a/tests/test_matmul.py b/tests/ops/gemm/test_matmul.py
similarity index 86%
rename from tests/test_matmul.py
rename to tests/ops/gemm/test_matmul.py
index a5bdf422..756b99a2 100644
--- a/tests/test_matmul.py
+++ b/tests/ops/gemm/test_matmul.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_matmul(device, input_size=128, hidden_size=128, output_size=128):
     def custom_matmul(a, b):
diff --git a/tests/ops/misc/__init__.py b/tests/ops/misc/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_expert_mask.py b/tests/ops/misc/test_expert_mask.py
similarity index 69%
rename from tests/test_expert_mask.py
rename to tests/ops/misc/test_expert_mask.py
index 4d240206..9da3b9ac 100644
--- a/tests/test_expert_mask.py
+++ b/tests/ops/misc/test_expert_mask.py
@@ -1,23 +1,12 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
-
 
 def test_expert_mask(device, batch=4, num_experts=8):
     # Regression test for issue #228:
diff --git a/tests/test_indirect_access.py b/tests/ops/misc/test_indirect_access.py
similarity index 84%
rename from tests/test_indirect_access.py
rename to tests/ops/misc/test_indirect_access.py
index 95167d1e..f64fe50d 100644
--- a/tests/test_indirect_access.py
+++ b/tests/ops/misc/test_indirect_access.py
@@ -1,20 +1,9 @@
+import os
+import sys
 import torch
 import copy
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_indirect_vectoradd(device, size=(128, 128)):
     def vectoradd(a, idx, b):
diff --git a/tests/ops/reduce/__init__.py b/tests/ops/reduce/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_batchnorm.py b/tests/ops/reduce/test_batchnorm.py
similarity index 63%
rename from tests/test_batchnorm.py
rename to tests/ops/reduce/test_batchnorm.py
index 065c0870..1c34942d 100644
--- a/tests/test_batchnorm.py
+++ b/tests/ops/reduce/test_batchnorm.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_BatchNorm(device, size=(1, 16, 64, 64)):
     torch.manual_seed(0)
diff --git a/tests/test_layernorm.py b/tests/ops/reduce/test_layernorm.py
similarity index 66%
rename from tests/test_layernorm.py
rename to tests/ops/reduce/test_layernorm.py
index 3db27dc5..553ce7e3 100644
--- a/tests/test_layernorm.py
+++ b/tests/ops/reduce/test_layernorm.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_LayerNorm(device, size=(64, 64)):
     torch.manual_seed(0)
diff --git a/tests/test_reduce.py b/tests/ops/reduce/test_reduce.py
similarity index 74%
rename from tests/test_reduce.py
rename to tests/ops/reduce/test_reduce.py
index 07f8fef2..d80e17b9 100644
--- a/tests/test_reduce.py
+++ b/tests/ops/reduce/test_reduce.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_reduce_sum(device, size, dim, keepdim=False):
     def reduce_sum(a, b, dim, keepdim):
diff --git a/tests/test_softmax.py b/tests/ops/reduce/test_softmax.py
similarity index 85%
rename from tests/test_softmax.py
rename to tests/ops/reduce/test_softmax.py
index 2dca97b7..2e5e6422 100644
--- a/tests/test_softmax.py
+++ b/tests/ops/reduce/test_softmax.py
@@ -1,17 +1,10 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_softmax(device, size=(128, 128), dim=1):
     torch.manual_seed(0)
diff --git a/tests/ops/sort/__init__.py b/tests/ops/sort/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_sort.py b/tests/ops/sort/test_sort.py
similarity index 90%
rename from tests/test_sort.py
rename to tests/ops/sort/test_sort.py
index 5bce2532..d9b51087 100644
--- a/tests/test_sort.py
+++ b/tests/ops/sort/test_sort.py
@@ -1,20 +1,9 @@
+import os
+import sys
 import argparse
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out:", out.cpu())
-        print("cpu out:", cpu_out)
-        raise SystemExit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 
 def test_equal(name, out, cpu_out):
diff --git a/tests/test_topk.py b/tests/ops/sort/test_topk.py
similarity index 66%
rename from tests/test_topk.py
rename to tests/ops/sort/test_topk.py
index caf56779..e27c199d 100644
--- a/tests/test_topk.py
+++ b/tests/ops/sort/test_topk.py
@@ -1,21 +1,10 @@
+import os
+import sys
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_topk(device, size=(128, 128), k=5, dim=-1, largest=True, sorted=True):
     # dim 해석을 위해 양수 인덱스로 변환
diff --git a/tests/ops/sparsity/__init__.py b/tests/ops/sparsity/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_sparse_core.py b/tests/ops/sparsity/test_sparse_core.py
similarity index 81%
rename from tests/test_sparse_core.py
rename to tests/ops/sparsity/test_sparse_core.py
index bb4ff630..21cd9344 100644
--- a/tests/test_sparse_core.py
+++ b/tests/ops/sparsity/test_sparse_core.py
@@ -3,21 +3,10 @@
 import torch._dynamo
 import torch.utils.cpp_extension
 import torch.nn.utils.prune as prune
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+import os
+import sys
+sys.path.insert(0, os.path.join(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'), 'tests'))  # same fallback root as the other tests
+from _pytorchsim_utils import test_result
 
 class MLP(nn.Module):
     def __init__(self, input_size=16, hidden_size=16, output_size=16, sparsity_fc1=0, sparsity_fc2=0):
@@ -79,7 +68,6 @@ def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, outp
 if __name__ == "__main__":
     import os
     import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim'))
     device = torch.device("npu:0")
     test_sparse_mlp(device, batch_size=8, input_size=16, hidden_size=32, output_size=64)
     
diff --git a/tests/test_sparsity.py b/tests/ops/sparsity/test_sparsity.py
similarity index 94%
rename from tests/test_sparsity.py
rename to tests/ops/sparsity/test_sparsity.py
index eaa7c63c..50253243 100644
--- a/tests/test_sparsity.py
+++ b/tests/ops/sparsity/test_sparsity.py
@@ -7,9 +7,10 @@
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-from test_transformer import EncoderBlock, test_result
-from test_mlp import MLP
+sys.path.insert(0, os.path.join(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'), 'tests'))
+from _pytorchsim_utils import test_result
+from models.test_transformer import EncoderBlock
+from models.test_mlp import MLP
 
 def apply_random_zero(tensor, zero_prob, block_size=8):
     if not 0 <= zero_prob <= 1:
diff --git a/tests/ops/view/__init__.py b/tests/ops/view/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_cat.py b/tests/ops/view/test_cat.py
similarity index 100%
rename from tests/test_cat.py
rename to tests/ops/view/test_cat.py
diff --git a/tests/test_transpose2D.py b/tests/ops/view/test_transpose2D.py
similarity index 69%
rename from tests/test_transpose2D.py
rename to tests/ops/view/test_transpose2D.py
index 4e9807ce..dd83e9ed 100644
--- a/tests/test_transpose2D.py
+++ b/tests/ops/view/test_transpose2D.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_Transpose2D(device, size=(16, 32)):
     def transpose(a):
diff --git a/tests/test_transpose3D.py b/tests/ops/view/test_transpose3D.py
similarity index 78%
rename from tests/test_transpose3D.py
rename to tests/ops/view/test_transpose3D.py
index e4d4e952..f121c713 100644
--- a/tests/test_transpose3D.py
+++ b/tests/ops/view/test_transpose3D.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_Transpose3D_1(device, size=(4, 16, 32)):
     def transpose(a, b):
diff --git a/tests/test_view3D_2D.py b/tests/ops/view/test_view3D_2D.py
similarity index 67%
rename from tests/test_view3D_2D.py
rename to tests/ops/view/test_view3D_2D.py
index cc7b5e41..b6e5ffff 100644
--- a/tests/test_view3D_2D.py
+++ b/tests/ops/view/test_view3D_2D.py
@@ -1,19 +1,8 @@
+import os
+import sys
 import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+from _pytorchsim_utils import test_result
 
 def test_view3D_2D(device, size=(16, 8, 16), t_x=0, t_y=1):
     def view3D_2D(a):
diff --git a/tests/system/__init__.py b/tests/system/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_eager.py b/tests/system/test_eager.py
similarity index 100%
rename from tests/test_eager.py
rename to tests/system/test_eager.py
diff --git a/tests/test_hetro.py b/tests/system/test_hetro.py
similarity index 94%
rename from tests/test_hetro.py
rename to tests/system/test_hetro.py
index eaf145d4..38cdd41c 100644
--- a/tests/test_hetro.py
+++ b/tests/system/test_hetro.py
@@ -3,10 +3,13 @@
 import torch
 import argparse
 
+# Repo root stays on sys.path so `Simulator.*` resolves when run as a script.
 sys.path.append(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"))
+# tests/ is prepended so the `system.*` test package resolves.
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
 
 from Simulator.simulator import TOGSimulator
-from test_stonne import sparse_matmul
+from system.test_stonne import sparse_matmul
 
 
 def custom_matmul(a, b):
diff --git a/tests/test_scheduler.py b/tests/system/test_scheduler.py
similarity index 87%
rename from tests/test_scheduler.py
rename to tests/system/test_scheduler.py
index beab8054..7e361402 100644
--- a/tests/test_scheduler.py
+++ b/tests/system/test_scheduler.py
@@ -1,10 +1,14 @@
 import os
+import sys
 import torch
 from torchvision.models import resnet18 as model1
-from test_transformer import EncoderBlock as model2
-from Simulator.simulator import TOGSimulator
 
 base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.append(base_path)
+sys.path.insert(0, os.path.join(base_path, 'tests'))
+from models.test_transformer import EncoderBlock as model2
+from Simulator.simulator import TOGSimulator
+
 config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml'
 
 target_model1 = model1().eval()
diff --git a/tests/test_stonne.py b/tests/system/test_stonne.py
similarity index 68%
rename from tests/test_stonne.py
rename to tests/system/test_stonne.py
index ac26c273..4febafd0 100644
--- a/tests/test_stonne.py
+++ b/tests/system/test_stonne.py
@@ -4,6 +4,10 @@
 import random
 import numpy as np
 import argparse
+import os
+import sys
+sys.path.insert(0, os.path.join(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'), 'tests'))  # same fallback root as the other tests
+from _pytorchsim_utils import test_result
 
 random.seed(0)
 np.random.seed(0)
@@ -13,21 +17,6 @@ def apply_pruning(tensor, sparsity):
     mask = torch.rand_like(tensor) >= sparsity
     tensor *= mask
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
-
 def sparse_matmul(a, b):
     return torch.sparse.mm(a, b)
 
@@ -52,7 +41,6 @@ def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128, spa
     parser.add_argument("sparsity", nargs="?", type=float, help="%% of zero", default=0.0)
 
     args = parser.parse_args()
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim'))
  
     device = torch.device("npu:0")
     test_sparse_mm(device, args.sz, args.sz, args.sz, args.sparsity)
\ No newline at end of file
diff --git a/tests/test_vectorops.py b/tests/system/test_vectorops.py
similarity index 64%
rename from tests/test_vectorops.py
rename to tests/system/test_vectorops.py
index 90e9c0f5..b83f3cf6 100644
--- a/tests/test_vectorops.py
+++ b/tests/system/test_vectorops.py
@@ -1,16 +1,21 @@
+import os
+import sys
+
 import torch
 
+sys.path.insert(0, os.path.join(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"), "tests"))
+
 if __name__ == "__main__":
     device = torch.device("npu:0")
-    
+
     # Target shape
     seq_list = [1,128,512,2048,8192]
     d_model = 768
-    from tests.test_add import test_vectoradd
-    from tests.test_activation import test_GeLU
-    from tests.test_reduce import test_reduce_sum2
-    from tests.test_layernorm import test_LayerNorm
-    from tests.test_softmax import test_softmax
+    from ops.elementwise.test_add import test_vectoradd
+    from ops.elementwise.test_activation import test_GeLU
+    from ops.reduce.test_reduce import test_reduce_sum2
+    from ops.reduce.test_layernorm import test_LayerNorm
+    from ops.reduce.test_softmax import test_softmax
     func_list = [test_vectoradd, test_GeLU, test_reduce_sum2, test_LayerNorm, test_softmax]
     for test_func in func_list:
         for seq in seq_list:
diff --git a/tests/test_exponent.py b/tests/test_exponent.py
deleted file mode 100644
index 20f0a143..00000000
--- a/tests/test_exponent.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import torch
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
-
-def test_exponent(device, size=(128, 128)):
-    def exponent(a):
-        return a.exp()
-    x = torch.randn(size).to(device=device)
-    opt_fn = torch.compile(dynamic=False)(exponent)
-    res = opt_fn(x)
-    out = exponent(x.cpu())
-    test_result("exponent", res, out)
-
-if __name__ == "__main__":
-    device = torch.device("npu:0")
-    test_exponent(device, size=(32, 32))