Contextual Memory Tree #1799

Merged
merged 57 commits into from
Jun 5, 2019
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
6cf9eb7
setup with the latest vw; online and offline aloi results can be repr…
Mar 2, 2019
bc5ba44
wiki online script
Mar 2, 2019
79ce8d2
wiki offline few shots script
Mar 2, 2019
c6a28ec
readme
Mar 2, 2019
145e681
.
Mar 2, 2019
87563f2
.
Mar 2, 2019
1c5c9a8
scripts updated
Mar 2, 2019
b8f051e
Merge branch 'master' of github.com:LAIRLAB/vowpal_wabbit
Mar 2, 2019
f1f7d55
seperated multilabel and multiclass
Mar 3, 2019
52071c2
updated xml part
Mar 3, 2019
b5de530
.
Mar 3, 2019
788bbb7
multilabel classification scripts
Mar 3, 2019
3e190c1
fixed loaded bug in multilabel setting
Mar 5, 2019
f664ac9
Merge remote-tracking branch 'upstream/master'
Mar 7, 2019
c30d7ee
a fix of nan prediction: initialized the ec.l.simple
Mar 8, 2019
cd7f9d1
update readme
Mar 8, 2019
630ae03
Merge remote-tracking branch 'upstream/master'
Mar 20, 2019
ff39bab
scripts added to demo
Mar 20, 2019
8b6f5c4
updates on scripts
Mar 20, 2019
ac6c146
Merge branch 'master' into master
JohnLangford Apr 1, 2019
8f8577e
Merge branch 'master' into master
JohnLangford Apr 1, 2019
5c3848e
fixed some comments
May 31, 2019
2feb347
remove the unique feature function and added sort feature to wikipara…
Jun 1, 2019
7c8c91d
sort namespace indices and then walk through two sorted indices to av…
Jun 1, 2019
0a413a4
avoided double loop in computing hamming loss
Jun 2, 2019
66e234c
random seed, name changed on descent and insert example rew
Jun 3, 2019
140102f
merge from upstream
Jun 3, 2019
caebc2b
add memory tree cc in cmakelist
Jun 3, 2019
8313f76
got rid of write it define in memory tree file, putted it in io buf h…
Jun 3, 2019
276d7ce
allocated a space in memory tree for designing kprod example, and fre…
Jun 3, 2019
b19bf61
Update vowpalwabbit/memory_tree.cc
Jun 4, 2019
401a9d3
Update vowpalwabbit/memory_tree.cc
Jun 4, 2019
b974b12
Update vowpalwabbit/memory_tree.cc
Jun 4, 2019
89b33ee
Update vowpalwabbit/memory_tree.cc
Jun 4, 2019
7fc9820
typo
Jun 4, 2019
0b3e1ba
Merge branch 'master' into master
JohnLangford Jun 4, 2019
5d7017b
Update vowpalwabbit/memory_tree.cc
Jun 4, 2019
654df26
Update vowpalwabbit/memory_tree.cc
Jun 4, 2019
b47360b
Update vowpalwabbit/memory_tree.cc
Jun 4, 2019
b684fcc
lower case alpha in demo scripts
Jun 4, 2019
6c34d8d
added two tests (online and offline) for cmt
Jun 4, 2019
affeac6
Update test/RunTests
Jun 4, 2019
cf463ea
Update test/RunTests
Jun 4, 2019
daaed75
Update test/RunTests
Jun 4, 2019
c48317f
Update test/RunTests
Jun 4, 2019
a2bdfc7
Update test/RunTests
Jun 4, 2019
d88479a
staged stderr files in train set ref folder and deleted time output i…
Jun 4, 2019
6c64c73
decrease problem (smaller rcv1) and solution size (bit 15)
Jun 4, 2019
8d3a40d
updates on stderr files
Jun 4, 2019
c6adcf6
ignore cache file
Jun 4, 2019
5bc6c87
dealt with some initilization
Jun 5, 2019
cf0ca60
.
Jun 5, 2019
802fefb
merge
Jun 5, 2019
90ef274
memory leak
Jun 5, 2019
563175f
memory leak..
Jun 5, 2019
d02a187
Update test/RunTests
JohnLangford Jun 5, 2019
645aada
Update test/RunTests
JohnLangford Jun 5, 2019
34 changes: 34 additions & 0 deletions demo/memory_tree/README.md
@@ -0,0 +1,34 @@
Contextual Memory Tree (CMT)
===============================

This demo exercises CMT for logarithmic-time multiclass classification (online and offline)
and logarithmic-time multilabel classification.


The multiclass datasets are [ALOI](http://aloi.science.uva.nl/) and WikiPara. ALOI
has 1000 classes with, on average, 100 training examples per class. WikiPara
contains 10000 classes. We consider two versions of WikiPara here: a 1-shot version with
1 training example per class, and a 2-shot version with 2 training examples per class.

The multilabel datasets are RCV1-2K, AmazonCat-13K, and Wiki10-31K from the Extreme Classification [repository](http://manikvarma.org/downloads/XC/XMLRepository.html).

We refer users to the [manuscript](https://arxiv.org/pdf/1807.06473.pdf) for the detailed data structures and algorithms of CMT.
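
As a rough intuition (a simplified sketch, not the repository's C++ implementation: the node class, `router`, and `score` functions below are all hypothetical), a contextual memory tree routes a query from the root to a leaf through learned routers and then scans the memories stored at that leaf:

```python
# Simplified sketch of CMT-style query routing; all names are hypothetical.
class Node:
    def __init__(self):
        self.left = None        # child nodes (None for a leaf)
        self.right = None
        self.memories = []      # (features, label) pairs stored at a leaf

def route(node, x, router):
    """Descend to a leaf: each internal node's router scores x left/right."""
    while node.left is not None:
        node = node.left if router(node, x) < 0 else node.right
    return node

def query(root, x, router, score):
    """Return the stored memory at the reached leaf that best matches x."""
    leaf = route(root, x, router)
    return max(leaf.memories, key=lambda m: score(x, m[0]), default=None)

# toy usage with trivial router/score functions
root = Node()
root.left, root.right = Node(), Node()
root.right.memories = [([1.0], "cat"), ([0.2], "dog")]
best = query(root, [0.9], router=lambda n, x: +1,
             score=lambda a, b: -abs(a[0] - b[0]))
```

Because the descent touches only one root-to-leaf path, querying is logarithmic in the number of stored memories, which is the property the demo exploits.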

## Dependencies:
Python 3

## Training Online Contextual Memory Tree on ALOI and WikiPara:
```bash
python aloi_script_progerror.py
python wikipara10000_script_progerror.py
```

## Training Offline Contextual Memory Tree on ALOI, WikiPara, RCV1-2K, AmazonCat-13K and Wiki10-31K:
```bash
python aloi_script.py
python wikipara10000_script.py
python xml_rcv1x.script.py
python xml_amazoncat_13K_script.py
python xml_wiki10.script.py
```
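
All of these scripts follow the same pattern: assemble one `vw` command line for training and one for testing. A minimal sketch of that pattern (the tree size and file names here are placeholders; the real scripts compute `tree_node` from the dataset size):

```python
# hypothetical inputs; the demo scripts derive tree_node from the dataset size
train_data, test_data, tree_node = "aloi_train.vw", "aloi_test.vw", 1000
saved_model = "{}.vw".format(train_data)

train_cmd = ("../../build/vowpalwabbit/vw {} --memory_tree {} "
             "-c --passes 3 --holdout_off -f {}").format(train_data, tree_node, saved_model)
test_cmd = "../../build/vowpalwabbit/vw {} -i {}".format(test_data, saved_model)

print(train_cmd)  # inspect the command before running it
# os.system(train_cmd); os.system(test_cmd)  # run it as the demo scripts do
```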

56 changes: 56 additions & 0 deletions demo/memory_tree/aloi_script.py
@@ -0,0 +1,56 @@
import os
import time
import numpy as np


print("## perform experiments on aloi ##")
num_of_classes = 1000
leaf_example_multiplier = 4  # 8
shots = 100
lr = 0.001
bits = 29
alpha = 0.1  # 0.3
passes = 3  # 5
use_oas = 0
dream_at_update = 0
learn_at_leaf = 1  # turning learn_at_leaf on actually works better
num_queries = 5  # int(np.log(passes*num_of_classes*shots))
loss = "squared"
dream_repeats = 3
online = 0

# number of internal nodes, sized so each leaf holds roughly
# log2(#examples) * leaf_example_multiplier examples
tree_node = int(2 * passes * (num_of_classes * shots /
                              (np.log(num_of_classes * shots) / np.log(2) * leaf_example_multiplier)))

train_data = "aloi_train.vw"
test_data = "aloi_test.vw"
if not os.path.exists(train_data):
    os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(train_data))
if not os.path.exists(test_data):
    os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(test_data))

saved_model = "{}.vw".format(train_data)

print("## Training...")
start = time.time()
command_train = ("../../build/vowpalwabbit/vw {} --memory_tree {} --learn_at_leaf {} "
                 "--max_number_of_labels {} --dream_at_update {} --dream_repeats {} "
                 "--oas {} --online {} --leaf_example_multiplier {} --Alpha {} "
                 "-l {} -b {} -c --passes {} --loss_function {} --holdout_off -f {}".format(
                     train_data, tree_node, learn_at_leaf, num_of_classes, dream_at_update,
                     dream_repeats, use_oas, online, leaf_example_multiplier,
                     alpha, lr, bits, passes, loss, saved_model))
print(command_train)
os.system(command_train)
train_time = time.time() - start

# test:
print("## Testing...")
start = time.time()
os.system("../../build/vowpalwabbit/vw {} -i {}".format(test_data, saved_model))
test_time = time.time() - start

print("## train time {}, and test time {}".format(train_time, test_time))




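The `tree_node` formula in the script above sizes the tree so that each leaf holds on the order of `log2(N) * leaf_example_multiplier` examples. A quick sanity check with the ALOI numbers (a standalone check, not part of the demo):

```python
import numpy as np

# ALOI settings from the script above
num_of_classes, shots, passes, leaf_example_multiplier = 1000, 100, 3, 4
N = num_of_classes * shots  # 100000 training examples

# same expression as in the script
tree_node = int(2 * passes * (N / (np.log(N) / np.log(2) * leaf_example_multiplier)))
# with N = 100000, log2(N) is about 16.6, so tree_node lands on the order of 9000
```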
56 changes: 56 additions & 0 deletions demo/memory_tree/aloi_script_progerror.py
@@ -0,0 +1,56 @@
import os
import time
import numpy as np


print("## perform experiments on aloi ##")
num_of_classes = 1000
leaf_example_multiplier = 10
shots = 100
lr = 0.001
bits = 29
alpha = 0.1  # 0.3
passes = 1  # 3, 5
use_oas = 0
dream_at_update = 0
learn_at_leaf = 1  # turning learn_at_leaf on actually works better
loss = "squared"
dream_repeats = 20  # 3
online = 1

# number of internal nodes, sized as in aloi_script.py
tree_node = int(2 * passes * (num_of_classes * shots /
                              (np.log(num_of_classes * shots) / np.log(2) * leaf_example_multiplier)))

train_data = "aloi_train.vw"
test_data = "aloi_test.vw"
if not os.path.exists(train_data):
    os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(train_data))
if not os.path.exists(test_data):
    os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(test_data))

saved_model = "{}.vw".format(train_data)

print("## Training...")
start = time.time()
os.system("../../build/vowpalwabbit/vw {} --memory_tree {} --learn_at_leaf {} "
          "--max_number_of_labels {} --dream_at_update {} --dream_repeats {} "
          "--oas {} --online {} --leaf_example_multiplier {} --Alpha {} "
          "-l {} -b {} -c --passes {} --loss_function {} --holdout_off -f {}".format(
              train_data, tree_node, learn_at_leaf, num_of_classes, dream_at_update,
              dream_repeats, use_oas, online, leaf_example_multiplier,
              alpha, lr, bits, passes, loss, saved_model))
train_time = time.time() - start

# In the online setting there is no separate test pass; vw reports
# progressive error during training.
print("## train time {}".format(train_time))





62 changes: 62 additions & 0 deletions demo/memory_tree/wikipara10000_script.py
@@ -0,0 +1,62 @@
import os
import time
import numpy as np


available_shots = {"three": 3, "one": 1}

for shot, shots in available_shots.items():
    print("## perform experiments on {}-shot wikipara-10K ##".format(shot))
    num_of_classes = 10000
    leaf_example_multiplier = 4  # 2
    lr = 0.1
    bits = 29  # 30
    passes = 2  # 1
    alpha = 0.1
    learn_at_leaf = 1
    use_oas = 0
    dream_at_update = 1
    dream_repeats = 5
    loss = "squared"
    online = 0

    # number of internal nodes, sized as in the ALOI scripts
    tree_node = int(2 * passes * (num_of_classes * shots /
                                  (np.log(num_of_classes * shots) / np.log(2) * leaf_example_multiplier)))

    train_data = "paradata10000_{}_shot.vw.train".format(shot)
    test_data = "paradata10000_{}_shot.vw.test".format(shot)
    if not os.path.exists(train_data):
        os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(train_data))
    if not os.path.exists(test_data):
        os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(test_data))

    saved_model = "{}.vw".format(train_data)

    print("## Training...")
    start = time.time()
    os.system("../../build/vowpalwabbit/vw {} --memory_tree {} --learn_at_leaf {} "
              "--max_number_of_labels {} --oas {} --online {} --dream_at_update {} "
              "--leaf_example_multiplier {} --dream_repeats {} "
              "--Alpha {} -l {} -b {} -c --passes {} --loss_function {} --holdout_off -f {}".format(
                  train_data, tree_node, learn_at_leaf, num_of_classes, use_oas, online,
                  dream_at_update, leaf_example_multiplier, dream_repeats,
                  alpha, lr, bits, passes, loss, saved_model))
    train_time = time.time() - start

    # test:
    print("## Testing...")
    start = time.time()
    os.system("../../build/vowpalwabbit/vw {} -i {}".format(test_data, saved_model))
    test_time = time.time() - start

    print("## train time {}, and test time {}".format(train_time, test_time))





60 changes: 60 additions & 0 deletions demo/memory_tree/wikipara10000_script_progerror.py
@@ -0,0 +1,60 @@
import os
import time
import numpy as np


# available_shots = {"three": 3, "one": 1}
available_shots = {"three": 3}

for shot, shots in available_shots.items():
    print("## perform experiments on {}-shot wikipara-10K ##".format(shot))
    num_of_classes = 10000
    leaf_example_multiplier = 10  # 2
    lr = 0.1
    bits = 29  # 30
    passes = 1  # 2
    alpha = 0.1
    learn_at_leaf = 0
    use_oas = 0
    dream_at_update = 1
    dream_repeats = 15
    loss = "squared"
    online = 1

    # number of internal nodes, sized as in the ALOI scripts
    tree_node = int(2 * passes * (num_of_classes * shots /
                                  (np.log(num_of_classes * shots) / np.log(2) * leaf_example_multiplier)))

    train_data = "paradata10000_{}_shot.vw.train".format(shot)
    test_data = "paradata10000_{}_shot.vw.test".format(shot)
    if not os.path.exists(train_data):
        os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(train_data))
    if not os.path.exists(test_data):
        os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(test_data))

    saved_model = "{}.vw".format(train_data)

    print("## Training...")
    start = time.time()
    os.system("../../build/vowpalwabbit/vw {} --memory_tree {} --learn_at_leaf {} "
              "--max_number_of_labels {} --oas {} --online {} --dream_at_update {} "
              "--leaf_example_multiplier {} --dream_repeats {} "
              "--Alpha {} -l {} -b {} -c --passes {} --loss_function {} --holdout_off -f {}".format(
                  train_data, tree_node, learn_at_leaf, num_of_classes, use_oas, online,
                  dream_at_update, leaf_example_multiplier, dream_repeats,
                  alpha, lr, bits, passes, loss, saved_model))
    train_time = time.time() - start

    # In the online setting there is no separate test pass; vw reports
    # progressive error during training.
    print("## train time {}".format(train_time))





54 changes: 54 additions & 0 deletions demo/memory_tree/xml_amazoncat_13K_script.py
@@ -0,0 +1,54 @@
import os
import time
import numpy as np


print("perform experiments on amazoncat 13K (multilabel)")
leaf_example_multiplier = 2
lr = 1
bits = 30
alpha = 0.1  # 0.3
passes = 4
learn_at_leaf = 1
use_oas = 1
dream_at_update = 1
loss = "squared"
dream_repeats = 3

num_examples = 1186239
max_num_labels = 13330

# number of internal nodes, sized from the training-set size
tree_node = int(num_examples / (np.log(num_examples) / np.log(2) * leaf_example_multiplier))

train_data = "amazoncat_train.mat.mult_label.vw.txt"
test_data = "amazoncat_test.mat.mult_label.vw.txt"
if not os.path.exists(train_data):
    os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(train_data))
if not os.path.exists(test_data):
    os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(test_data))

saved_model = "{}.vw".format(train_data)

print("## Training...")
start = time.time()
os.system("../../build/vowpalwabbit/vw {} --memory_tree {} --learn_at_leaf {} --dream_at_update {} "
          "--max_number_of_labels {} --dream_repeats {} --oas {} "
          "--leaf_example_multiplier {} --Alpha {} -l {} -b {} -c --passes {} "
          "--loss_function {} --holdout_off -f {}".format(
              train_data, tree_node, learn_at_leaf, dream_at_update,
              max_num_labels, dream_repeats, use_oas,
              leaf_example_multiplier, alpha, lr, bits, passes, loss, saved_model))
train_time = time.time() - start

print("## Testing...")
start = time.time()
os.system("../../build/vowpalwabbit/vw {} --oas {} -i {}".format(test_data, use_oas, saved_model))
test_time = time.time() - start
print("## train time {}, and test time {}".format(train_time, test_time))

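In the multilabel scripts, `--oas` (one-against-some) is passed at both train and test time so the model scores label sets rather than a single class. The hamming loss mentioned in the commit history ("avoided double loop in computing hamming loss") can be illustrated on toy label sets (a hedged illustration of the metric itself, not vw's exact reporting):

```python
def hamming_loss(predicted, actual):
    """Count labels present in exactly one of the two sets (symmetric difference)."""
    return len(set(predicted) ^ set(actual))

# label 3 is predicted but absent, label 4 is present but missed: loss of 2
loss = hamming_loss([1, 3, 5], [1, 4, 5])
```

Using set operations avoids the double loop over predicted and actual labels that the commit above removed.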
53 changes: 53 additions & 0 deletions demo/memory_tree/xml_rcv1x.script.py
@@ -0,0 +1,53 @@
import os
import time
import numpy as np


print("perform experiments on rcv1x (multilabel)")
leaf_example_multiplier = 2
lr = 0.1
bits = 30
alpha = 0.1
passes = 6  # 4
learn_at_leaf = 1
use_oas = 1
dream_at_update = 0  # 1
loss = "squared"
dream_repeats = 3

num_examples = 630000
max_num_labels = 2456

# number of internal nodes, sized from the training-set size
tree_node = int(num_examples / (np.log(num_examples) / np.log(2) * leaf_example_multiplier))

train_data = "rcv1x_train.mat.mult_label.vw.txt"
test_data = "rcv1x_test.mat.mult_label.vw.txt"
if not os.path.exists(train_data):
    os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(train_data))
if not os.path.exists(test_data):
    os.system("wget http://kalman.ml.cmu.edu/wen_datasets/{}".format(test_data))

saved_model = "{}.vw".format(train_data)

print("## Training...")
start = time.time()
os.system("../../build/vowpalwabbit/vw {} --memory_tree {} --learn_at_leaf {} --dream_at_update {} "
          "--max_number_of_labels {} --dream_repeats {} --oas {} "
          "--leaf_example_multiplier {} --Alpha {} -l {} -b {} -c --passes {} "
          "--loss_function {} -f {}".format(
              train_data, tree_node, learn_at_leaf, dream_at_update,
              max_num_labels, dream_repeats, use_oas,
              leaf_example_multiplier, alpha, lr, bits, passes, loss, saved_model))
train_time = time.time() - start

print("## Testing...")
start = time.time()
os.system("../../build/vowpalwabbit/vw {} --oas {} -i {}".format(test_data, use_oas, saved_model))
test_time = time.time() - start
print("## train time {}, and test time {}".format(train_time, test_time))
