In [None]:
import jittor as jt
jt.flags.use_cuda = 1
def add_same_shape(a, b):
    assert a.shape == b.shape, "The two input tensors must have the same shape"
    return jt.code(
        a.shape,  # Output Shape
        a.dtype,  # Output Type
        [a, b],   # Input Tensor List
        cuda_src='''
        __global__ void kernel_add(@ARGS_DEF) {
            @PRECALC;
            for (int i=0; i<in0_shape0; i++){
                    @out(i) = @in0(i) + @in1(i);
            }
        }
        kernel_add<<<32, 32>>>(@ARGS);
        '''       
    )             # Cuda Code

jt.flags.use_cuda = 1
a = jt.Var([1, 2, 3])
b = jt.Var([3, 4, 5, 6])
add_same_shape(a, b)

In [None]:
import jittor as jt
# 直接从op.py导入自定义函数（确保op.py与测试文件在同一目录）
from op import jt_upfirdn2d_large

# -------------------------- 1. 基础配置 --------------------------
jt.flags.use_cuda = 1  # 开启CUDA（必须）
jt.seed(42)            # 固定随机种子，结果可复现

# -------------------------- 2. 生成测试数据（模拟CIFAR-10格式） --------------------------
batch_size = 4          # 批量大小（可自定义）
in_h, in_w = 32, 32     # CIFAR-10图像尺寸
channels = 3            # RGB三通道
kernel_size = 3         # 3×3均值滤波核（简单平滑效果）

# 输入张量：(batch_size, in_h, in_w, channels)
input_tensor = jt.randn(batch_size, in_h, in_w, channels)
# 滤波核：(kernel_size, kernel_size)
kernel = jt.ones(kernel_size, kernel_size) / (kernel_size ** 2)  # 均值权重

# -------------------------- 3. 调用自定义函数（测试典型场景） --------------------------
# 场景：上采样2倍 + 下采样2倍 + 1像素填充（最终尺寸不变，仅平滑）
output_tensor = jt_upfirdn2d_large(
    input=input_tensor,
    kernel=kernel,
    up_x=2, up_y=2,    # x/y方向上采样2倍
    down_x=2, down_y=2, # x/y方向下采样2倍
    pad_x0=1, pad_x1=1, # x方向左右各填1像素
    pad_y0=1, pad_y1=1  # y方向上下各填1像素
)

# -------------------------- 4. 验证结果（简单检查） --------------------------
print("=== 简化测试结果 ===")
print(f"输入张量形状：{input_tensor.shape}")
print(f"输出张量形状：{output_tensor.shape}")
print(f"函数调用：{'成功' if output_tensor is not None else '失败'}")

# 额外检查：输出尺寸是否符合预期（此处预期与输入尺寸一致）
expected_out_shape = (batch_size, in_h, in_w, channels)
if output_tensor.shape == expected_out_shape:
    print(f"尺寸验证：通过（符合预期 {expected_out_shape}）")
else:
    print(f"尺寸验证：失败（预期 {expected_out_shape}，实际 {output_tensor.shape}）")

# 可选：打印部分输出值，确认数据合理性
print("\n输出张量前2个像素值（批量0、通道0）：")
output_tensor # 打印(0,0,0,0)和(0,0,1,0)位置的数值

In [2]:
import jittor as jt
a = jt.rand([5,2,1,4])
def jt_int_mult(a):
    x = jt.Var([1])
    return jt.code(
        x.shape,
        jt.int32,
        [a],
        cpu_src=f'''
                @out(0) = in0_shape0;
            ''' # No need to add '@' when getting shape of tensors in C++.
    )
jt.flags.use_cuda = 1
jt_int_mult(a)

jt.Var([5], dtype=int32)

In [5]:
import jittor as jt
a = jt.Var([5,2,1,4])
def jt_alias_test(a):
    return jt.code(
        a.shape,
        a.dtype,
        [a],
        cpu_src=f'''
            for (int i=0; i<in0_shape0; i++)
                @out(i) = @in0(i);
            ''' # No need to add '@' when getting shape of tensors in C++.
    )
jt.flags.use_cuda = 1
jt_alias_test(a)

jt.Var([5 2 1 4], dtype=int32)

In [8]:
import jittor as jt
a = jt.Var([5,2,1,4])
def jt_alias_test(a):
    return jt.code(
        a.shape,
        a.dtype,
        [a],
        cpu_src=f'''
            @alias(b, in0)
            @alias(res, out)
            for (int i=0; i<b_shape0; i++)
                @res(i) = @b(i);
            ''' # No need to add '@' when getting shape of tensors in C++.
    )
jt.flags.use_cuda = 1
jt_alias_test(a)

jt.Var([5 2 1 4], dtype=int32)

In [None]:
print('"Wrong inputs arguments, Please refer to examples(help(jt.numpy)).\n\nTypes of your inputs are:\n self\t= Var,\n args\t= (),\n\nThe function declarations are:\n ArrayArgs fetch_sync()\n\nFailed reason:\u001b[38;5;1m[f 0910 20:43:24.055093 84 parallel_compiler.cc:331] Error happend during compilation:\n [Error] source file location:/home/a516/.cache/jittor/jt1.3.9/g++12.3.0/py3.8.20/Linux-6.6.87.2xef/13thGenIntelRCx37/4832/default/cu12.8.61/jit/code__IN_SIZE_1__in0_dim_4__in0_type_float32__OUT_SIZE_1__out0_dim_4__out0_type_int32__HEA___hash_1fefade715b2521e_op.cc\nCompile operator(1/2)failed:Op(47:1:1:1:i1:o1:s0:g1,code->48)\n\nReason: \u001b[38;5;1m[f 0910 20:43:24.054767 44:C1 op_compiler.cc:719] \u001b[38;5;1m[f 0910 20:43:24.054761 44:C1 op_compiler.cc:719] \u001b[38;5;1m[f 0910 20:43:24.054749 44:C1 op_compiler.cc:719] \u001b[38;5;1m[f 0910 20:43:24.054727 44:C1 op_compiler.cc:687] Check failed macros.at(dim)(4) == S(args.size())(1) res dimension not matched\u001b[m\nJit compiler error:\n                @res(i) = @b(i);\u001b[m\nJit compiler error:\n    @CODE\u001b[m\nJit compiler error:\n#ifndef JIT\u001b[m\n\u001b[m",')

In [None]:
print('"Wrong inputs arguments, Please refer to examples(help(jt.numpy)).\n\nTypes of your inputs are:\n self\t= Var,\n args\t= (),\n\nThe function declarations are:\n ArrayArgs fetch_sync()\n\nFailed reason:\u001b[38;5;1m[f 0910 20:47:22.597627 48 parallel_compiler.cc:331] Error happend during compilation:\n [Error] source file location:/home/a516/.cache/jittor/jt1.3.9/g++12.3.0/py3.8.20/Linux-6.6.87.2xef/13thGenIntelRCx37/4832/default/cu12.8.61/jit/code__IN_SIZE_1__in0_dim_4__in0_type_float32__OUT_SIZE_1__out0_dim_4__out0_type_int32__HEA___hash_99721febc08a03a_op.cc\nCompile operator(1/2)failed:Op(28:1:1:1:i1:o1:s0:g1,code->29)\n\nReason: \u001b[38;5;1m[f 0910 20:47:22.596386 40:C1 op_compiler.cc:719] \u001b[38;5;1m[f 0910 20:47:22.596382 40:C1 op_compiler.cc:719] \u001b[38;5;1m[f 0910 20:47:22.596373 40:C1 op_compiler.cc:719] \u001b[38;5;1m[f 0910 20:47:22.596345 40:C1 op_compiler.cc:687] Check failed macros.at(dim)(4) == S(args.size())(1) out0 dimension not matched\u001b[m\nJit compiler error:\n                @out0(i) = @in0(i);\u001b[m\nJit compiler error:\n    @CODE\u001b[m\nJit compiler error:\n#ifndef JIT\u001b[m\n\u001b[m",')

In [15]:
import jittor as jt
a = jt.rand([5,2,1,4])
def jt_int_mult(a):
    x = jt.Var([1])
    return jt.code(
        x.shape,
        jt.int32,
        [a],
        cpu_src=f'''
                @alias(a, in0)
                @out(0) = a_shape0;
            ''' # No need to add '@' when getting shape of tensors in C++.
    )
jt.flags.use_cuda = 1
jt_int_mult(a)
# 利用 alias 只能给输入变量取别名，不能给其维度等属性取别名

jt.Var([5], dtype=int32)

In [26]:
import jittor as jt
a = jt.rand([5,2,1,4])
def jt_int_mult(a):
    x = jt.Var([1])
    return jt.code(
        x.shape,
        jt.int32,
        [a],
        cuda_src='''
            __global__ void kernel_add(@ARGS_DEF) {
                @PRECALC;
                @alias(a, in0);
                @out(0) = a_shape0;
            }
            kernel_add<<<32, 32>>>(@ARGS);
            ''' # No need to add '@' when getting shape of tensors in C++.
    )
jt.flags.use_cuda = 1
jt_int_mult(a)
# 利用 alias 只能给输入变量取别名，不能给其维度等属性取别名

jt.Var([5], dtype=int32)

In [35]:
import jittor as jt
a = jt.rand([5,2,1,4])
def jt_int_mult(a):
    x = jt.Var([1])
    return jt.code(
        x.shape,
        jt.int32,
        [a],
        cuda_src='''
            __global__ void kernel_add(@ARGS_DEF) {
                @PRECALC;
                @alias(a, in0);
                
                const int c = a_shape0;
                @out(0) = c;
            }
            kernel_add<<<32, 32>>>(@ARGS);
            ''' # No need to add '@' when getting shape of tensors in C++.
    )
jt.flags.use_cuda = 1
jt_int_mult(a)


Compiling Operators(1/1) used: 2.13s eta:    0s 


jt.Var([5], dtype=int32)

In [6]:
import jittor as jt
def upfirdn2d_large(
    input: jt.Var,    # 输入张量，形状：(major_dim, in_h, in_w, minor_dim)
    kernel: jt.Var,   # 滤波核张量，形状：(kernel_h, kernel_w)
    up_x: int,        # x方向上采样倍数
    up_y: int,        # y方向上采样倍数
    down_x: int,      # x方向下采样倍数
    down_y: int,      # y方向下采样倍数
    pad_x0: int,      # x方向左填充
    pad_x1: int,      # x方向右填充
    pad_y0: int,      # y方向上填充
    pad_y1: int       # y方向下填充
) -> jt.Var:
    """
    Upfirdn2d impletement by jittor.code
    """
    # -------------------------- 1. 计算输出张量形状（与原核函数逻辑一致）--------------------------
    major_dim = input.shape[0]  # 对应 p.major_dim（如批量维度）
    in_h = input.shape[1]       # 对应 p.in_h（输入高度）
    in_w = input.shape[2]       # 对应 p.in_w（输入宽度）
    minor_dim = input.shape[3]  # 对应 p.minor_dim（如通道维度）
    kernel_h = kernel.shape[0]  # 对应 p.kernel_h（滤波核高度）
    kernel_w = kernel.shape[1]  # 对应 p.kernel_w（滤波核宽度）

    # 计算输出高度/宽度（原核函数 p.out_h/p.out_w 的计算公式）
    out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h + down_y) // down_y
    out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w + down_x) // down_x

    # 定义线程配置参数（与原核函数一致：分块处理大任务）
    loop_major = (major_dim - 1) // 16384 + 1  # 对应 p.loop_major
    loop_x = 4                                 # 对应 p.loop_x
    block_size_x = 4                           # 对应原 blockDim.x
    block_size_y = 32                          # 对应原 blockDim.y


    # -------------------------- 2. 调用 jt.code 实现 CUDA 逻辑--------------------------
    return jt.code(
        shape=(major_dim, out_h, out_w, minor_dim),  # 输出张量形状
        dtype=input.dtype,                            # 输出数据类型与输入一致
        # inputs = [@in0, @in1, @in2, @in3, @in4, @in5,   @in6,   @in7,    @in8,   @in9]
        inputs=[input, kernel, jt.array([up_x], dtype=jt.int32), 
        jt.array([up_y], dtype=jt.int32),
        jt.array([down_x], dtype=jt.int32),
        jt.array([down_y], dtype=jt.int32),
        jt.array([pad_x0], dtype=jt.int32),
        jt.array([pad_x1], dtype=jt.int32),
        jt.array([pad_y0], dtype=jt.int32),
        jt.array([pad_y1], dtype=jt.int32)],
        # CUDA API
        cuda_header='''
        #include <cuda.h>
        #include <cuda_runtime.h>
        ''',
        cuda_src='''
            __global__ static void jt_upfirdn2d_kernel(@ARGS_DEF) {
                @PRECALC
                @alias(input, in0);
                @alias(kernel, in1);
                @alias(up_x, in2);
                @alias(up_y, in3);
                @alias(down_x, in4);
                @alias(down_y, in5);
                @alias(pad_x0, in6);
                @alias(pad_x1, in7);
                @alias(pad_y0, in8);
                @alias(pad_y1, in9);

                const int major_dim = input_shape0;
                const int in_h = input_shape1;
                const int in_w = input_shape2;
                const int minor_dim = input_shape3;
                const int kernel_h = kernel_shape0;
                const int kernel_w = kernel_shape1;
                int out_h = (in_h * @up_y(0) + @pad_y0(0) + @pad_y1(0) - kernel_shape0 + @down_y(0)) / @down_y(0);
                int out_w = (in_w * @up_x(0) + @pad_x0(0) + @pad_x1(0) - kernel_shape1 + @down_x(0)) / @down_x(0);
                const int res_int = (major_dim - 1) / 16384;
                const int loop_major = res_int + 1;
                const int loop_x = 4;

                int minor_idx = blockIdx.x * blockDim.x + threadIdx.x;  // 对应原 minor_idx
                int out_y = minor_idx / minor_dim;                     // 对应原 out_y
                minor_idx -= out_y * minor_dim;                        // 修正 minor_idx

                int out_x_base = blockIdx.y * loop_x * blockDim.y + threadIdx.y;  // 对应原 out_x_base
                int major_idx_base = blockIdx.z * loop_major;                    // 对应原 major_idx_base

                // 边界检查：超出输出范围的线程直接退出（避免无效计算）
                if (out_x_base >= out_w || out_y >= out_h || major_idx_base >= major_dim) {
                    return;
                }

                int mid_y = out_y * @down_y(0) + @up_y(0) - 1 - @pad_y0(0);  // 对应原 mid_y
                
                int in_y = min(max((mid_y >= 0 ? mid_y / @up_y(0) : (mid_y - @up_y(0) + 1) / @up_y(0)), 0), in_h);  // 对应原 floor_div
                
                int h = min(max((mid_y + kernel_h >= 0 ? (mid_y + kernel_h) / @up_y(0) : ((mid_y + kernel_h) - @up_y(0) + 1) / @up_y(0)), 0), in_h) - in_y;  // 对应原 h
                
                int kernel_y = mid_y + kernel_h - (in_y + 1) * @up_y(0);  // 对应原 kernel_y

                // 遍历批量维度（major_dim）：分 loop_major 次处理
                for (int loop_major_cnt = 0, major_idx = major_idx_base;
                     loop_major_cnt < loop_major && major_idx < major_dim;
                     loop_major_cnt++, major_idx++) {
                    
                    // 遍历 x 方向：分 loop_x 次处理（每次处理 blockDim.y 个 out_x）
                    for (int loop_x_cnt = 0, out_x = out_x_base;
                         loop_x_cnt < loop_x && out_x < out_w;
                         loop_x_cnt++, out_x += blockDim.y) {

                        int mid_x = out_x * @down_x(0) + @up_x(0) - 1 - @pad_x0(0);  // 对应原 mid_x
                        int in_x = min(max((mid_x >= 0 ? mid_x / @up_x(0) : (mid_x - @up_x(0) + 1) / @up_x(0)), 0), in_w);  // 对应原 floor_div
                        int w = min(max((mid_x + kernel_w >= 0 ? (mid_x + kernel_w) / @up_x(0) : ((mid_x + kernel_w) - @up_x(0) + 1) / @up_x(0)), 0), in_w) - in_x;  // 对应原 w
                        int kernel_x = mid_x + kernel_w - (in_x + 1) * @up_x(0);  // 对应原 kernel_x

                        float val = 0.0f;  // 存储当前输出像素的累加值

                        // 遍历滤波核 y 方向覆盖的输入行
                        for (int y = 0; y < h; y++) {
                            // 遍历滤波核 x 方向覆盖的输入列
                            for (int x = 0; x < w; x++) {
                                // 1. 读取输入张量对应位置的值（@in0 访问 input，索引：(major_idx, in_y+y, in_x+x, minor_idx)）
                                float input_val = @in0(major_idx, in_y + y, in_x + x, minor_idx);
                                // 2. 读取滤波核对应位置的权重（@in1 访问 kernel，索引：(kernel_y+y, kernel_x+x)）
                                float kernel_val = @in1(kernel_y + y, kernel_x + x);
                                // 3. 累加：输入值 × 核权重
                                val += input_val * kernel_val;
                            }
                        }
                        @out(major_idx, out_y, out_x, minor_idx) = val;
                    }
                }
            }
           jt_upfirdn2d_kernel<<<32, 32>>>(@ARGS);
        '''
    )

In [8]:
import jittor as jt
import numpy as np

jt.flags.use_cuda = 1

# -------------------------- 1. 初始化输入张量 --------------------------
batch, in_h, in_w, channels = 1, 4, 4, 1  # 小尺寸，便于验证
input_np = np.ones((batch, in_h, in_w, channels), dtype=np.float32)
input_jt = jt.array(input_np)

# 卷积核全 1，3x3
kernel_np = np.ones((3,3), dtype=np.float32)
kernel_np /= kernel_np.sum()  # 归一化
kernel_jt = jt.array(kernel_np)

# -------------------------- 2. 设置采样/填充参数 --------------------------
up_x, up_y = 2, 2
down_x, down_y = 1, 1
pad_x0, pad_x1 = 1, 1
pad_y0, pad_y1 = 1, 1

# -------------------------- 3. 调用 jt_upfirdn2d_large --------------------------
output = upfirdn2d_large(
    input=input_jt,
    kernel=kernel_jt,
    up_x=up_x,
    up_y=up_y,
    down_x=down_x,
    down_y=down_y,
    pad_x0=pad_x0,
    pad_x1=pad_x1,
    pad_y0=pad_y0,
    pad_y1=pad_y1
)

# -------------------------- 4. 转回 NumPy 查看结果 --------------------------
output_np = output.numpy()
print("输出张量形状:", output_np.shape)
print("输出张量前4x4区域:\n", output_np[0, :4, :4, 0])



输出张量形状: (1, 8, 8, 1)
输出张量前4x4区域:
 [[0.11111111 0.22222222 0.11111111 0.22222222]
 [0.11111111 0.11111111 0.11111111 0.11111111]
 [0.11111111 0.22222222 0.11111111 0.22222222]
 [0.11111111 0.11111111 0.11111111 0.11111111]]


In [4]:
import time
from functools import lru_cache

def fibonacci(n):
    if n == 1:
        return 1
    elif n == 2:
        return 1
    else:
        return fibonacci(n - 1) + fibonacci(n - 2)
@lru_cache(maxsize=None)
def new_fib(n):
    if n == 1:
        return 1
    elif n == 2:
        return 1
    else:
        return new_fib(n - 1) + new_fib(n - 2)
n = 40
t1 = time.perf_counter()
fibonacci(n)
t2 = time.perf_counter()
new_fib(n)
t3 = time.perf_counter()
print(f'Time cost:\n new_fib:{t3 - t2} \n fibonacci:{t2 - t1}')
if t2 - t1 > t3 - t2:
    print("new_fib is faster.")
elif t2 - t1 < t3 - t2:
    print("fibonacci is faster.")
else:
    print("Can not dicide which is faster in this situation.")

Time cost:
 new_fib:5.239100028120447e-05 
 fibonacci:8.127972070999022
new_fib is faster.
