re-evaluate models with jsquad prompt with title

Stability-AI · Sep 17, 2023 · 7e9e1ab · 7e9e1ab
1 parent 11d8c89
commit 7e9e1ab
Show file tree

Hide file tree

Showing 69 changed files with 470 additions and 114 deletions.
diff --git a/models/abeja-gpt-neox-japanese-2.7b/harness.jsquad-1.2.sh b/models/abeja-gpt-neox-japanese-2.7b/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=abeja/gpt-neox-japanese-2.7b,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.2"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/abeja-gpt-neox-japanese-2.7b/result.jsquad-1.2.json"
diff --git a/models/abeja-gpt-neox-japanese-2.7b/harness.sh b/models/abeja-gpt-neox-japanese-2.7b/harness.sh
@@ -1,3 +1,3 @@
 MODEL_ARGS="pretrained=abeja/gpt-neox-japanese-2.7b"
-TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,xlsum_ja"
+TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.2-0.2,xlsum_ja"
 python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1" --device "cuda" --output_path "models/abeja-gpt-neox-japanese-2.7b/result.json"
diff --git a/models/abeja-gpt-neox-japanese-2.7b/result.json b/models/abeja-gpt-neox-japanese-2.7b/result.json
@@ -18,9 +18,9 @@
       "acc_norm": 0.749912800837112,
       "acc_norm_stderr": 0.005719527388015089
     },
-    "jsquad-1.1-0.2": {
-      "exact_match": 13.665015758667266,
-      "f1": 22.909453892411364
+    "jsquad-1.2-0.2": {
+      "exact_match": 15.803692030616839,
+      "f1": 25.18326978234071
     },
     "xlsum_ja": {
       "rouge2": 6.149952794206885
@@ -33,7 +33,7 @@
   "versions": {
     "jcommonsenseqa-1.1-0.2": 1.1,
     "jnli-1.1-0.2": 1.1,
-    "jsquad-1.1-0.2": 1.1,
+    "jsquad-1.2-0.2": 1.2,
     "marc_ja-1.1-0.2": 1.1,
     "xlsum_ja": 1.0,
     "xwinograd_ja": 1.0

diff --git a/models/abeja-gpt-neox-japanese-2.7b/result.jsquad-1.2.json b/models/abeja-gpt-neox-japanese-2.7b/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.2": {
+      "exact_match": 15.803692030616839,
+      "f1": 25.18326978234071
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.2": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=abeja/gpt-neox-japanese-2.7b,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 3,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/cyberagent/cyberagent-open-calm-1b/harness.jsquad-1.2.sh b/models/cyberagent/cyberagent-open-calm-1b/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=cyberagent/open-calm-1b,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.2"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/cyberagent-open-calm-1b/result.jsquad-1.2.json"
diff --git a/models/cyberagent/cyberagent-open-calm-1b/harness.sh b/models/cyberagent/cyberagent-open-calm-1b/harness.sh
@@ -1,3 +1,3 @@
 MODEL_ARGS="pretrained=cyberagent/open-calm-1b"
-TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,xlsum_ja"
+TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.2-0.2,xlsum_ja"
 python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1" --device "cuda" --output_path "models/cyberagent-open-calm-1b/result.json"
diff --git a/models/cyberagent/cyberagent-open-calm-1b/result.json b/models/cyberagent/cyberagent-open-calm-1b/result.json
@@ -18,9 +18,9 @@
       "acc_norm": 0.7792117195674921,
       "acc_norm_stderr": 0.005478034657719626
     },
-    "jsquad-1.1-0.2": {
-      "exact_match": 37.12291760468258,
-      "f1": 47.171446643186265
+    "jsquad-1.2-0.2": {
+      "exact_match": 39.53174245835209,
+      "f1": 49.49399460234075
     },
     "xlsum_ja": {
       "rouge2": 2.288077088085482
@@ -33,7 +33,7 @@
   "versions": {
     "jcommonsenseqa-1.1-0.2": 1.1,
     "jnli-1.1-0.2": 1.1,
-    "jsquad-1.1-0.2": 1.1,
+    "jsquad-1.2-0.2": 1.2,
     "marc_ja-1.1-0.2": 1.1,
     "xlsum_ja": 1.0,
     "xwinograd_ja": 1.0

diff --git a/models/cyberagent/cyberagent-open-calm-1b/result.jsquad-1.2.json b/models/cyberagent/cyberagent-open-calm-1b/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.2": {
+      "exact_match": 39.53174245835209,
+      "f1": 49.49399460234075
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.2": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=cyberagent/open-calm-1b",
+    "num_fewshot": 3,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/cyberagent/cyberagent-open-calm-3b/harness.jsquad-1.2.sh b/models/cyberagent/cyberagent-open-calm-3b/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=cyberagent/open-calm-3b,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.2"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json"
diff --git a/models/cyberagent/cyberagent-open-calm-3b/harness.sh b/models/cyberagent/cyberagent-open-calm-3b/harness.sh
@@ -1,3 +1,3 @@
 MODEL_ARGS="pretrained=cyberagent/open-calm-3b"
-TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
+TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.2-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
 python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-3b/result.json"
diff --git a/models/cyberagent/cyberagent-open-calm-3b/result.json b/models/cyberagent/cyberagent-open-calm-3b/result.json
@@ -22,9 +22,9 @@
       "acc": 0.6360792492179353,
       "acc_stderr": 0.015544482535576241
     },
-    "jsquad-1.1-0.2": {
-      "exact_match": 40.45475011256191,
-      "f1": 52.73709875917724
+    "jsquad-1.2-0.2": {
+      "exact_match": 44.529491220171096,
+      "f1": 56.02141036867636
     },
     "jaqket_v2-0.1-0.2": {
       "exact_match": 46.90721649484536,
@@ -42,7 +42,7 @@
     "jcommonsenseqa-1.1-0.2": 1.1,
     "jnli-1.1-0.2": 1.1,
     "marc_ja-1.1-0.2": 1.1,
-    "jsquad-1.1-0.2": 1.1,
+    "jsquad-1.2-0.2": 1.2,
     "jaqket_v2-0.1-0.2": 0.1,
     "xlsum_ja": 1.0,
     "xwinograd_ja": 1.0,

diff --git a/models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json b/models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.2": {
+      "exact_match": 44.529491220171096,
+      "f1": 56.02141036867636
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.2": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=cyberagent/open-calm-3b,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 2,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/cyberagent/cyberagent-open-calm-7b/harness.jsquad-1.2.sh b/models/cyberagent/cyberagent-open-calm-7b/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=cyberagent/open-calm-7b,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.2"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json"
diff --git a/models/cyberagent/cyberagent-open-calm-7b/harness.sh b/models/cyberagent/cyberagent-open-calm-7b/harness.sh
@@ -1,3 +1,3 @@
 MODEL_ARGS="pretrained=cyberagent/open-calm-7b"
-TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
+TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.2-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
 python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-7b/result.json"
diff --git a/models/cyberagent/cyberagent-open-calm-7b/result.json b/models/cyberagent/cyberagent-open-calm-7b/result.json
@@ -22,9 +22,9 @@
       "acc": 0.6506777893639207,
       "acc_stderr": 0.01540328448938605
     },
-    "jsquad-1.1-0.2": {
-      "exact_match": 45.79018460153084,
-      "f1": 59.03158509144496
+    "jsquad-1.2-0.2": {
+      "exact_match": 48.10895992796038,
+      "f1": 60.90961937230767
     },
     "jaqket_v2-0.1-0.2": {
       "exact_match": 60.738831615120276,
@@ -42,7 +42,7 @@
     "jcommonsenseqa-1.1-0.2": 1.1,
     "jnli-1.1-0.2": 1.1,
     "marc_ja-1.1-0.2": 1.1,
-    "jsquad-1.1-0.2": 1.1,
+    "jsquad-1.2-0.2": 1.2,
     "jaqket_v2-0.1-0.2": 0.1,
     "xlsum_ja": 1.0,
     "xwinograd_ja": 1.0,

diff --git a/models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json b/models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.2": {
+      "exact_match": 48.10895992796038,
+      "f1": 60.90961937230767
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.2": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=cyberagent/open-calm-7b,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 2,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/cyberagent/cyberagent-open-calm-large/harness.jsquad-1.2.sh b/models/cyberagent/cyberagent-open-calm-large/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=cyberagent/open-calm-large,use_fast=True,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.2"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/cyberagent-open-calm-large/result.jsquad-1.2.json"
diff --git a/models/cyberagent/cyberagent-open-calm-large/result.json b/models/cyberagent/cyberagent-open-calm-large/result.json
@@ -18,9 +18,9 @@
       "acc_norm": 0.7912452040460412,
       "acc_norm_stderr": 0.005367632889806105
     },
-    "jsquad-1.1-0.2": {
-      "exact_match": 37.23547951373255,
-      "f1": 48.50349592141573
+    "jsquad-1.2-0.2": {
+      "exact_match": 40.4997748761819,
+      "f1": 51.32160467436942
     },
     "xlsum_ja": {
       "rouge2": 1.9854375467671679
@@ -33,7 +33,7 @@
   "versions": {
     "jcommonsenseqa-1.1-0.2": 1.1,
     "jnli-1.1-0.2": 1.1,
-    "jsquad-1.1-0.2": 1.1,
+    "jsquad-1.2-0.2": 1.2,
     "marc_ja-1.1-0.2": 1.1,
     "xlsum_ja": 1.0,
     "xwinograd_ja": 1.0

diff --git a/models/cyberagent/cyberagent-open-calm-large/result.jsquad-1.2.json b/models/cyberagent/cyberagent-open-calm-large/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.2": {
+      "exact_match": 40.4997748761819,
+      "f1": 51.32160467436942
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.2": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=cyberagent/open-calm-large,use_fast=True,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 3,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/cyberagent/cyberagent-open-calm-medium/harness.jsquad-1.2.sh b/models/cyberagent/cyberagent-open-calm-medium/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=cyberagent/open-calm-medium,use_fast=True,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.2"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/cyberagent-open-calm-medium/result.jsquad-1.2.json"
diff --git a/models/cyberagent/cyberagent-open-calm-medium/result.json b/models/cyberagent/cyberagent-open-calm-medium/result.json
@@ -18,9 +18,9 @@
       "acc_norm": 0.8357167771189397,
       "acc_norm_stderr": 0.004893675823612713
     },
-    "jsquad-1.1-0.2": {
-      "exact_match": 28.725799189554255,
-      "f1": 39.80333448254385
+    "jsquad-1.2-0.2": {
+      "exact_match": 29.85141828005403,
+      "f1": 40.49655778214922
     },
     "xlsum_ja": {
       "rouge2": 2.5775988917922406
@@ -33,7 +33,7 @@
   "versions": {
     "jcommonsenseqa-1.1-0.2": 1.1,
     "jnli-1.1-0.2": 1.1,
-    "jsquad-1.1-0.2": 1.1,
+    "jsquad-1.2-0.2": 1.2,
     "marc_ja-1.1-0.2": 1.1,
     "xlsum_ja": 1.0,
     "xwinograd_ja": 1.0

diff --git a/models/cyberagent/cyberagent-open-calm-medium/result.jsquad-1.2.json b/models/cyberagent/cyberagent-open-calm-medium/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.2": {
+      "exact_match": 29.85141828005403,
+      "f1": 40.49655778214922
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.2": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=cyberagent/open-calm-medium,use_fast=True,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 3,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/llama/llama-7b/harness.jsquad-1.2.sh b/models/llama/llama-7b/harness.jsquad-1.2.sh
@@ -0,0 +1,4 @@
+MODEL_ARGS="pretrained=huggyllama/llama-7b,use_accelerate=True,load_in_8bit=True"
+TASK="jsquad-1.2-0.3"
+python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama/llama-7b/result.jsquad-1.2.json" --batch_size 2
+
diff --git a/models/llama/llama-7b/harness.sh b/models/llama/llama-7b/harness.sh
@@ -1,3 +1,3 @@
 MODEL_ARGS="pretrained=huggyllama/llama-7b,use_accelerate=True,load_in_8bit=True"
-TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
+TASK="jsquad-1.2-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
 python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3" --device "cuda" --output_path "models/llama/llama-7b/result.json" --batch_size 2  > models/llama/llama-7b/harness.out 2> models/llama/llama-7b/harness.err 
diff --git a/models/llama/llama-7b/result.json b/models/llama/llama-7b/result.json
@@ -1,8 +1,8 @@
 {
   "results": {
-    "jsquad-1.1-0.3": {
-      "exact_match": 34.46645655110311,
-      "f1": 50.01682040381688
+    "jsquad-1.2-0.3": {
+      "exact_match": 36.24493471409275,
+      "f1": 50.91625240527312
     },
     "jcommonsenseqa-1.1-0.3": {
       "acc": 0.38337801608579086,
@@ -24,7 +24,7 @@
     }
   },
   "versions": {
-    "jsquad-1.1-0.3": 1.1,
+    "jsquad-1.2-0.3": 1.2,
     "jcommonsenseqa-1.1-0.3": 1.1,
     "jnli-1.1-0.3": 1.1,
     "marc_ja-1.1-0.3": 1.1

diff --git a/models/llama/llama-7b/result.jsquad-1.2.json b/models/llama/llama-7b/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.3": {
+      "exact_match": 36.24493471409275,
+      "f1": 50.91625240527312
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.3": 1.2
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=huggyllama/llama-7b,use_accelerate=True,load_in_8bit=True",
+    "num_fewshot": 2,
+    "batch_size": 2,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/llama2/llama2-2.7b/harness.jsquad-1.1-0.31.sh b/models/llama2/llama2-2.7b/harness.jsquad-1.2.sh
@@ -1,4 +1,4 @@
 MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,dtype=auto"
-TASK="jsquad-1.1-0.31"
-python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama2/llama2-2.7b/result.jsquad-1.1-0.31.json" --batch_size 2
+TASK="jsquad-1.2-0.3"
+python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama2/llama2-2.7b/result.jsquad-1.2.json" --batch_size 2
 
diff --git a/models/llama2/llama2-2.7b/harness.sh b/models/llama2/llama2-2.7b/harness.sh
@@ -1,5 +1,5 @@
 MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True"
-TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
+TASK="jsquad-1.2-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
 python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3" --device "cuda" --output_path "models/llama2/llama2-2.7b/result.json" --batch_size 2  > models/llama2/llama2-2.7b/harness.out 2> models/llama2/llama2-2.7b/harness.err