Specialization:


It appears that the `@` annotations on the parameters of ``` get42 ``` are optional, in the sense that the JIT produces the same result with or without
them. The only annotation that matters is the one on the ```range_``` function.
```
// Key table: nine bytes stored inline in the struct (fixed-size array, not a pointer).
struct Keys{
    array : [u8 * 9],
}


// Loop generator used by `for i in range_(b, e) { ... }`.
// Calls `body` once for every value b, b+1, ..., e-1; does nothing when b >= e.
fn range_(mut b: i32, e: i32, body: fn(int) -> ()) -> () {
    while b < e {
        // Post-increment: `body` receives the current counter, then the counter advances.
        body(b++)
    }
}

// Looks up `key` in the nine-entry key table and returns the byte of `values`
// stored at the matching position.
//   keys   - key table (struct `Keys`, nine bytes in its `array` field)
//   values - byte array indexed with the position found in `keys`
//   key    - byte to search for
// The loop never exits early, so the LAST matching position wins. When no entry
// matches, `res` keeps its initial value 0 and `values(0)` is returned.
fn get42(@keys : Keys, values: &[u8], @key: u8) -> u8 {
    let mut res = 0;

    for i in range_(0,9) {
        // `keys` is a `Keys` struct, so the bytes are reached through its `array`
        // field (same access as in the CUDA variant of this function).
        if keys.array(i) == key {
            res = i;
        }
    }

    values(res)
}
```

JIT area:

**Just return the value for key '7'; the keys are '1' .. '8' (the constant `12345678` visible in the dumps below), so it returns 'g'.**


```


// Builds a small Impala entry point `dummy` that calls get42 with the key table
// spliced in as a string literal, appends it to the Impala source in `fun_impala`,
// JIT-compiles the result and invokes `dummy` on the value string "abcdefghi".
// NOTE: `c` is pasted into the generated source unescaped, so it must not contain
// `"` or `\` characters.
std::string c(c_array);

std::string dummy_fun;
dummy_fun += "extern fn dummy(vals : &[u8],key:u8) -> (){\n";
dummy_fun += "  let b = get42(\"" + c +  "\",vals,'7');\n";
dummy_fun += "  print_char(b);";
dummy_fun += "  print_string(\"\n\");}";

std::string program = std::string((char*)fun_impala) + dummy_fun;
// Third argument is passed through unchanged (presumably the optimization level - TODO confirm).
auto key = anydsl_compile(program.c_str(),program.size(),3);
// `dummy` is declared above as (vals: &[u8], key: u8) -> (): two parameters and no
// return value. The pointer type and the call must match that signature.
typedef void (*function) (const char*,char);
auto call = reinterpret_cast<function>(anydsl_lookup_function(key,"dummy"));
// The generated `dummy` ignores its `key` parameter (it hard-codes '7'); the same
// value is passed here only to satisfy the signature.
call("abcdefghi",'7');

```

LLVM:

**Looks like it is specialized:**

```
 ; ModuleID = 'jit'
source_filename = "jit"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@_8009 = internal global [2 x i8] c"\0A\00"

; Specialized CPU entry point (recorded compiler output; comments added for reading).
; No loop and no key comparison are left: the lookup is reduced to a load of vals[6],
; which is printed, followed by the newline string @_8009. %key_7950 is never used.
define void @dummy([0 x i8]* nocapture readonly %vals_7949, i8 %key_7950) local_unnamed_addr {

dummy_start:
  ; address of element 6 of vals (constant index)
  %0 = getelementptr inbounds [0 x i8], [0 x i8]* %vals_7949, i64 0, i64 6
  %1 = load i8, i8* %0, align 1
  tail call void @anydsl_print_char(i8 %1)
  tail call void @anydsl_print_string([0 x i8]* bitcast ([2 x i8]* @_8009 to [0 x i8]*))
  ret void
}

declare void @anydsl_print_char(i8) local_unnamed_addr

declare void @anydsl_print_string([0 x i8]*) local_unnamed_addr 


```



**Cuda variant:**

The compiler produces a lot of code.

```
// CUDA variant of get42: the key search is placed inside a `with cuda(...)` region and
// the matching index travels back through a one-element i32 buffer.
// Returns values(index of the last matching key); when nothing matches the buffer keeps
// its initial 0 and values(0) is returned.
fn get42(@keys : Keys, values: &[u8], @key: u8) -> u8 {
    let grid   = (1, 1, 1);
    let block  = (1, 1, 1);
    let device = 0;
    // NOTE(review): the first argument is a literal 0 rather than `device`; both are 0 here.
    let buf = alloc_cuda_unified(0,sizeof[int]());
    // View the buffer as a one-element i32 array and clear it before the region runs.
    let buf_ = bitcast[&mut[i32 * 1]](buf.data);
    buf_(0) = 0;
    let mut res = 0;
    with cuda(device, grid, block){

            for i in range_(0,9) {
                if keys.array(i) == key {// == key
                    // The result is written to the buffer instead of `res`.
                    buf_(0) = i;
                    // res = i;

                }
            }
        }

    // Wait for the device before reading the result back, then free the buffer.
    synchronize_cuda(device);
    res = buf_(0);
    release(buf);
    values(res)
    

}

```

.cu code :

```

extern "C" {
// Fixed-size array wrappers emitted by the code generator (recorded output; comments added).
// array_5547: one int - the kernel writes its result into e[0].
typedef struct {
    int e[1];
} array_5547;
// array_5543: nine bytes - type of the constant key table built inside the kernel.
typedef struct {
    unsigned char e[9];
} array_5543;

// Accessors that return the components of the CUDA built-ins threadIdx, blockIdx,
// blockDim and gridDim as int. The kernel in this dump does not call any of them.
__device__ inline int threadIdx_x() { return threadIdx.x; }
__device__ inline int threadIdx_y() { return threadIdx.y; }
__device__ inline int threadIdx_z() { return threadIdx.z; }
__device__ inline int blockIdx_x() { return blockIdx.x; }
__device__ inline int blockIdx_y() { return blockIdx.y; }
__device__ inline int blockIdx_z() { return blockIdx.z; }
__device__ inline int blockDim_x() { return blockDim.x; }
__device__ inline int blockDim_y() { return blockDim.y; }
__device__ inline int blockDim_z() { return blockDim.z; }
__device__ inline int gridDim_x() { return gridDim.x; }
__device__ inline int gridDim_y() { return gridDim.y; }
__device__ inline int gridDim_z() { return gridDim.z; }
__global__ void lambda_9908(char*);

// Generated kernel (recorded compiler output; comments added for reading).
// Reinterprets its single pointer argument as an array_5547 (one int), runs a counter
// from 0 to 8, compares entry [counter] of a constant 9-byte table with 55 (ASCII '7')
// and stores the counter into e[0] on every match. The search loop is still present,
// i.e. it was not evaluated at compile time. No thread or block index is read;
// __launch_bounds__ is 1 thread per block.
__global__ __launch_bounds__ (1 * 1 * 1) void lambda_9908(char* _9911_10879) {
    array_5547* buf__10914;
    buf__10914 = (array_5547*)_9911_10879;
    
    // Pointer to the result slot e[0].
    int* _10916;
    _10916 = &buf__10914->e[0];
    // Loop counter, kept in a local slot and accessed through a pointer.
    int b_10885_slot;
    int* b_10885;
    b_10885 = &b_10885_slot;
    *b_10885 = 0;
    goto l10880;
    // Loop head: continue while counter < 9.
    l10880: ;
        int _10886;
        _10886 = *b_10885;
        int _10887;
        _10887 = _10886;
        bool _10889;
        _10889 = _10887 < 9;
        if (_10889) goto l10890; else goto l10919;
    // Loop exit.
    l10919: ;
        return ;
    // Loop body.
    l10890: ;
        int _10903;
        _10903 = *b_10885;
        int _10904;
        _10904 = _10903;
        // Key table: ASCII '1'..'8' followed by 0; it is rebuilt on every iteration.
        array_5543 _10900_3;
        {
        array_5543 _10900_3_tmp = { { 49, 50, 51, 52, 53, 54, 55, 56, 0, } };
         _10900_3 = _10900_3_tmp;
        }
        
        unsigned char _10905;
        _10905 = _10900_3.e[_10904];
        int _10912;
        _10912 = 1 + _10904;
        // 55 is ASCII '7', the key being searched for.
        bool _10906;
        _10906 = _10905 == 55;
        *b_10885 = _10912;
        if (_10906) goto l10907; else goto l10918;
    l10918: ;
        goto l10908;
    // Match: store the pre-increment counter into the result slot.
    l10907: ;
        *_10916 = _10904;
        goto l10908;
    l10908: ;
        goto l10880;
}

}

```

llvm:

```
; ModuleID = 'jit'
source_filename = "jit"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@0 = private unnamed_addr constant [12 x i8] c"lambda_9908\00", align 1
@1 = private unnamed_addr constant [7 x i8] c"jit.cu\00", align 1
@_11497 = internal global [2 x i8] c"\0A\00"

; Host side of the CUDA variant (recorded compiler output; comments added for reading).
; Allocates a 4-byte buffer with @anydsl_alloc_unified, zeroes it, launches the kernel
; named by @0 ("lambda_9908") from the file named by @1 ("jit.cu") with both dimension
; triples set to {1,1,1}, synchronizes, reads the i32 result back, releases the buffer
; and prints vals[result] followed by the newline string @_11497.
define void @dummy([0 x i8]* %vals_11421) {
dummy_start:
  %0 = alloca [3 x i32]
  %1 = alloca [3 x i32]
  %anydsl_alloc_unified1 = alloca i8*
  %types = alloca [1 x i8]
  %aligns = alloca [1 x i32]
  %sizes = alloca [1 x i32]
  %args = alloca [1 x i8*]
  %res_11450 = alloca i32
  br label %dummy

dummy:                                            ; preds = %dummy_start
  ; 4-byte allocation; the leading i32 1 is the device argument as emitted by the compiler
  %2 = call [0 x i8]* @anydsl_alloc_unified(i32 1, i64 4)
  br label %anydsl_alloc_unified_cont

anydsl_alloc_unified_cont:                        ; preds = %dummy
  %anydsl_alloc_unified = phi [0 x i8]* [ %2, %dummy ]
  ; view the buffer as [1 x i32] and clear element 0; also clear the local res
  %3 = bitcast [0 x i8]* %anydsl_alloc_unified to [1 x i32]*
  %4 = getelementptr inbounds [1 x i32], [1 x i32]* %3, i64 0, i32 0
  store i32 0, i32* %4, align 4
  store i32 0, i32* %res_11450, align 4
  ; single kernel argument: the address of a stack slot holding the buffer pointer
  %5 = bitcast [0 x i8]* %anydsl_alloc_unified to i8*
  store i8* %5, i8** %anydsl_alloc_unified1
  %6 = bitcast i8** %anydsl_alloc_unified1 to i8*
  %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %args, i32 0, i32 0
  %8 = getelementptr inbounds [1 x i32], [1 x i32]* %sizes, i32 0, i32 0
  %9 = getelementptr inbounds [1 x i32], [1 x i32]* %aligns, i32 0, i32 0
  %10 = getelementptr inbounds [1 x i8], [1 x i8]* %types, i32 0, i32 0
  ; argument descriptor: size 8, alignment 8, type tag 1 (tag meaning not visible in this dump)
  store i8* %6, i8** %7
  store i32 8, i32* %8
  store i32 8, i32* %9
  store i8 1, i8* %10
  ; both dimension triples are {1,1,1}
  store [3 x i32] [i32 1, i32 1, i32 1], [3 x i32]* %1
  store [3 x i32] [i32 1, i32 1, i32 1], [3 x i32]* %0
  %11 = getelementptr inbounds [3 x i32], [3 x i32]* %1, i32 0, i32 0
  %12 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0
  %13 = getelementptr inbounds [1 x i8*], [1 x i8*]* %args, i32 0, i32 0
  %14 = getelementptr inbounds [1 x i32], [1 x i32]* %sizes, i32 0, i32 0
  %15 = getelementptr inbounds [1 x i32], [1 x i32]* %aligns, i32 0, i32 0
  %16 = getelementptr inbounds [1 x i8], [1 x i8]* %types, i32 0, i32 0
  ; trailing i32 1 presumably is the argument count (one entry in each array) - TODO confirm
  call void @anydsl_launch_kernel(i32 1, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @1, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @0, i32 0, i32 0), i32* %11, i32* %12, i8** %13, i32* %14, i32* %15, i8* %16, i32 1)
  br label %_

_:                                                ; preds = %anydsl_alloc_unified_cont
  call void @anydsl_synchronize(i32 1)
  br label %synchronize_cuda_cont

synchronize_cuda_cont:                            ; preds = %_
  ; read the result written by the kernel, then release the buffer
  %17 = load i32, i32* %4, align 4
  store i32 %17, i32* %res_11450, align 4
  call void @anydsl_release(i32 1, [0 x i8]* %anydsl_alloc_unified)
  br label %release_cont

release_cont:                                     ; preds = %synchronize_cuda_cont
  ; vals[res] is loaded with a run-time index and printed
  %18 = load i32, i32* %res_11450, align 4
  %19 = getelementptr inbounds [0 x i8], [0 x i8]* %vals_11421, i64 0, i32 %18
  %20 = load i8, i8* %19, align 1
  call void @anydsl_print_char(i8 %20)
  br label %anydsl_print_char_cont

anydsl_print_char_cont:                           ; preds = %release_cont
  call void @anydsl_print_string([0 x i8]* bitcast ([2 x i8]* @_11497 to [0 x i8]*))
  br label %return

return:                                           ; preds = %anydsl_print_char_cont
  ret void
}

declare [0 x i8]* @anydsl_alloc_unified(i32, i64)

declare void @anydsl_launch_kernel(i32, i8*, i8*, i32*, i32*, i8**, i32*, i32*, i8*, i32)

declare void @anydsl_synchronize(i32)

declare void @anydsl_release(i32, [0 x i8]*)

declare void @anydsl_print_char(i8)

declare void @anydsl_print_string([0 x i8]*)


```


Unspecialized, i.e. ```fn @(false) range_ ``` and **keys** without annotations (the annotation results in a segfault in the CPU backend, while on CUDA everything is OK):

output:
```
; ModuleID = 'jit'
source_filename = "jit"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@_7972 = internal global [2 x i8] c"\0A\00"
@_7935 = internal global [9 x i8] c"12345678\00"

; Unspecialized CPU entry point (recorded compiler output; comments added for reading).
; The search loop is still present at run time: %b_7920 counts from 0 while it is < 9,
; each step loads a byte of the constant @_7935 ("12345678\00"), compares it with
; 55 (ASCII '7') and, on a match, stores the index into %res_7949. After the loop
; vals[res] is printed, followed by the newline string @_7972.
define void @dummy([0 x i8]* %vals_7913) {
dummy_start:
  %res_7949 = alloca i32
  %b_7920 = alloca i32
  br label %dummy

dummy:                                            ; preds = %dummy_start
  store i32 0, i32* %res_7949, align 4
  store i32 0, i32* %b_7920, align 4
  br label %while_head

while_head:                                       ; preds = %body_cont, %dummy
  ; loop condition: counter < 9
  %0 = load i32, i32* %b_7920, align 4
  %1 = icmp slt i32 %0, 9
  br i1 %1, label %expr_true, label %expr_false

expr_false:                                       ; preds = %while_head
  ; loop finished: print vals[res]
  %2 = load i32, i32* %res_7949, align 4
  %3 = getelementptr inbounds [0 x i8], [0 x i8]* %vals_7913, i64 0, i32 %2
  %4 = load i8, i8* %3, align 1
  call void @anydsl_print_char(i8 %4)
  br label %anydsl_print_char_cont

anydsl_print_char_cont:                           ; preds = %expr_false
  call void @anydsl_print_string([0 x i8]* bitcast ([2 x i8]* @_7972 to [0 x i8]*))
  br label %return

return:                                           ; preds = %anydsl_print_char_cont
  ret void

expr_true:                                        ; preds = %while_head
  ; loop body: load key byte [counter], advance the counter, compare with 55 ('7')
  %5 = load i32, i32* %b_7920, align 4
  %6 = getelementptr inbounds [9 x i8], [9 x i8]* @_7935, i64 0, i32 %5
  %7 = load i8, i8* %6
  %8 = add nsw i32 1, %5
  %9 = icmp eq i8 %7, 55
  store i32 %8, i32* %b_7920, align 4
  br i1 %9, label %expr_true2, label %expr_false1

expr_false1:                                      ; preds = %expr_true
  br label %body_cont

expr_true2:                                       ; preds = %expr_true
  ; match: remember the pre-increment counter
  store i32 %5, i32* %res_7949, align 4
  br label %body_cont

body_cont:                                        ; preds = %expr_true2, %expr_false1
  br label %while_head
}

declare void @anydsl_print_char(i8)

declare void @anydsl_print_string([0 x i8]*)
```


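**Cuda_unspec (llvm)** (this host code and the `.cu` kernel below appear identical to the output of the CUDA variant above):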

```

; ModuleID = 'jit'
source_filename = "jit"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@0 = private unnamed_addr constant [12 x i8] c"lambda_9908\00", align 1
@1 = private unnamed_addr constant [7 x i8] c"jit.cu\00", align 1
@_11497 = internal global [2 x i8] c"\0A\00"

; Host side of the unspecialized CUDA variant (recorded compiler output; comments added).
; Allocates a 4-byte buffer with @anydsl_alloc_unified, zeroes it, launches the kernel
; named by @0 ("lambda_9908") from the file named by @1 ("jit.cu") with both dimension
; triples set to {1,1,1}, synchronizes, reads the i32 result back, releases the buffer
; and prints vals[result] followed by the newline string @_11497.
define void @dummy([0 x i8]* %vals_11421) {
dummy_start:
  %0 = alloca [3 x i32]
  %1 = alloca [3 x i32]
  %anydsl_alloc_unified1 = alloca i8*
  %types = alloca [1 x i8]
  %aligns = alloca [1 x i32]
  %sizes = alloca [1 x i32]
  %args = alloca [1 x i8*]
  %res_11450 = alloca i32
  br label %dummy

dummy:                                            ; preds = %dummy_start
  ; 4-byte allocation; the leading i32 1 is the device argument as emitted by the compiler
  %2 = call [0 x i8]* @anydsl_alloc_unified(i32 1, i64 4)
  br label %anydsl_alloc_unified_cont

anydsl_alloc_unified_cont:                        ; preds = %dummy
  %anydsl_alloc_unified = phi [0 x i8]* [ %2, %dummy ]
  ; view the buffer as [1 x i32] and clear element 0; also clear the local res
  %3 = bitcast [0 x i8]* %anydsl_alloc_unified to [1 x i32]*
  %4 = getelementptr inbounds [1 x i32], [1 x i32]* %3, i64 0, i32 0
  store i32 0, i32* %4, align 4
  store i32 0, i32* %res_11450, align 4
  ; single kernel argument: the address of a stack slot holding the buffer pointer
  %5 = bitcast [0 x i8]* %anydsl_alloc_unified to i8*
  store i8* %5, i8** %anydsl_alloc_unified1
  %6 = bitcast i8** %anydsl_alloc_unified1 to i8*
  %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %args, i32 0, i32 0
  %8 = getelementptr inbounds [1 x i32], [1 x i32]* %sizes, i32 0, i32 0
  %9 = getelementptr inbounds [1 x i32], [1 x i32]* %aligns, i32 0, i32 0
  %10 = getelementptr inbounds [1 x i8], [1 x i8]* %types, i32 0, i32 0
  ; argument descriptor: size 8, alignment 8, type tag 1 (tag meaning not visible in this dump)
  store i8* %6, i8** %7
  store i32 8, i32* %8
  store i32 8, i32* %9
  store i8 1, i8* %10
  ; both dimension triples are {1,1,1}
  store [3 x i32] [i32 1, i32 1, i32 1], [3 x i32]* %1
  store [3 x i32] [i32 1, i32 1, i32 1], [3 x i32]* %0
  %11 = getelementptr inbounds [3 x i32], [3 x i32]* %1, i32 0, i32 0
  %12 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0
  %13 = getelementptr inbounds [1 x i8*], [1 x i8*]* %args, i32 0, i32 0
  %14 = getelementptr inbounds [1 x i32], [1 x i32]* %sizes, i32 0, i32 0
  %15 = getelementptr inbounds [1 x i32], [1 x i32]* %aligns, i32 0, i32 0
  %16 = getelementptr inbounds [1 x i8], [1 x i8]* %types, i32 0, i32 0
  ; trailing i32 1 presumably is the argument count (one entry in each array) - TODO confirm
  call void @anydsl_launch_kernel(i32 1, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @1, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @0, i32 0, i32 0), i32* %11, i32* %12, i8** %13, i32* %14, i32* %15, i8* %16, i32 1)
  br label %_

_:                                                ; preds = %anydsl_alloc_unified_cont
  call void @anydsl_synchronize(i32 1)
  br label %synchronize_cuda_cont

synchronize_cuda_cont:                            ; preds = %_
  ; read the result written by the kernel, then release the buffer
  %17 = load i32, i32* %4, align 4
  store i32 %17, i32* %res_11450, align 4
  call void @anydsl_release(i32 1, [0 x i8]* %anydsl_alloc_unified)
  br label %release_cont

release_cont:                                     ; preds = %synchronize_cuda_cont
  ; vals[res] is loaded with a run-time index and printed
  %18 = load i32, i32* %res_11450, align 4
  %19 = getelementptr inbounds [0 x i8], [0 x i8]* %vals_11421, i64 0, i32 %18
  %20 = load i8, i8* %19, align 1
  call void @anydsl_print_char(i8 %20)
  br label %anydsl_print_char_cont

anydsl_print_char_cont:                           ; preds = %release_cont
  call void @anydsl_print_string([0 x i8]* bitcast ([2 x i8]* @_11497 to [0 x i8]*))
  br label %return

return:                                           ; preds = %anydsl_print_char_cont
  ret void
}

declare [0 x i8]* @anydsl_alloc_unified(i32, i64)

declare void @anydsl_launch_kernel(i32, i8*, i8*, i32*, i32*, i8**, i32*, i32*, i8*, i32)

declare void @anydsl_synchronize(i32)

declare void @anydsl_release(i32, [0 x i8]*)

declare void @anydsl_print_char(i8)

declare void @anydsl_print_string([0 x i8]*)


```

**.cu:**
```
extern "C" {
// Fixed-size array wrappers emitted by the code generator (recorded output; comments added).
// array_5547: one int - the kernel writes its result into e[0].
typedef struct {
    int e[1];
} array_5547;
// array_5543: nine bytes - type of the constant key table built inside the kernel.
typedef struct {
    unsigned char e[9];
} array_5543;

// Accessors that return the components of the CUDA built-ins threadIdx, blockIdx,
// blockDim and gridDim as int. The kernel in this dump does not call any of them.
__device__ inline int threadIdx_x() { return threadIdx.x; }
__device__ inline int threadIdx_y() { return threadIdx.y; }
__device__ inline int threadIdx_z() { return threadIdx.z; }
__device__ inline int blockIdx_x() { return blockIdx.x; }
__device__ inline int blockIdx_y() { return blockIdx.y; }
__device__ inline int blockIdx_z() { return blockIdx.z; }
__device__ inline int blockDim_x() { return blockDim.x; }
__device__ inline int blockDim_y() { return blockDim.y; }
__device__ inline int blockDim_z() { return blockDim.z; }
__device__ inline int gridDim_x() { return gridDim.x; }
__device__ inline int gridDim_y() { return gridDim.y; }
__device__ inline int gridDim_z() { return gridDim.z; }
__global__ void lambda_9908(char*);

// Generated kernel for the unspecialized run (recorded compiler output; comments added).
// Reinterprets its single pointer argument as an array_5547 (one int), runs a counter
// from 0 to 8, compares entry [counter] of a constant 9-byte table with 55 (ASCII '7')
// and stores the counter into e[0] on every match. No thread or block index is read;
// __launch_bounds__ is 1 thread per block.
__global__ __launch_bounds__ (1 * 1 * 1) void lambda_9908(char* _9911_10879) {
    array_5547* buf__10914;
    buf__10914 = (array_5547*)_9911_10879;
    
    // Pointer to the result slot e[0].
    int* _10916;
    _10916 = &buf__10914->e[0];
    // Loop counter, kept in a local slot and accessed through a pointer.
    int b_10885_slot;
    int* b_10885;
    b_10885 = &b_10885_slot;
    *b_10885 = 0;
    goto l10880;
    // Loop head: continue while counter < 9.
    l10880: ;
        int _10886;
        _10886 = *b_10885;
        int _10887;
        _10887 = _10886;
        bool _10889;
        _10889 = _10887 < 9;
        if (_10889) goto l10890; else goto l10919;
    // Loop exit.
    l10919: ;
        return ;
    // Loop body.
    l10890: ;
        int _10903;
        _10903 = *b_10885;
        int _10904;
        _10904 = _10903;
        // Key table: ASCII '1'..'8' followed by 0; it is rebuilt on every iteration.
        array_5543 _10900_3;
        {
        array_5543 _10900_3_tmp = { { 49, 50, 51, 52, 53, 54, 55, 56, 0, } };
         _10900_3 = _10900_3_tmp;
        }
        
        unsigned char _10905;
        _10905 = _10900_3.e[_10904];
        int _10912;
        _10912 = 1 + _10904;
        // 55 is ASCII '7', the key being searched for.
        bool _10906;
        _10906 = _10905 == 55;
        *b_10885 = _10912;
        if (_10906) goto l10907; else goto l10918;
    l10918: ;
        goto l10908;
    // Match: store the pre-increment counter into the result slot.
    l10907: ;
        *_10916 = _10904;
        goto l10908;
    l10908: ;
        goto l10880;
}

}
```