Refactor bw file selection into a separate class

Also refactor to use file sizes in bytes, and to compare the length of the request rather than the chosen file name.
TheTorProject · Jan 25, 2018 · 98aa946 · 98aa946
1 parent 943e003
commit 98aa946
Show file tree

Hide file tree

Showing 2 changed files with 50 additions and 33 deletions.
diff --git a/bwscanner/measurement.py b/bwscanner/measurement.py
@@ -1,3 +1,4 @@
+import random
 import time
 
 from stem.descriptor.server_descriptor import ServerDescriptor
@@ -14,11 +15,48 @@
 
 # defer.setDebugging(True)
 
+M = 1024*1024
 
 class DownloadIncomplete(Exception):
     pass
 
 
+class BwFiles(object):
+    def __init__(self):
+        self.urls = []
+        self.bw_files = {
+            64*M: ("64M", "913b3c5df256d62235f955fa936e7a4e2d5e0cb6"),
+            32*M: ("32M", "a536076ef51c2cfff607fec2d362671e031d6b48"),
+            16*M: ("16M", "e91690ed2abf05e347b61aafaa23abf2a2b3292f"),
+            8*M: ("8M", "c690229b300945ec4ba872b80e8c443e2e1750f0"),
+            4*M: ("4M", "94f7bc6679a4419b080debd70166c2e43e80533d"),
+            2*M: ("2M", "9793cc92932598898d22497acdd5d732037b1a13"),
+        }
+
+    def add_url(self, base_url):
+        if base_url not in self.urls:
+            self.urls.append(base_url)
+
+    def get_base_url(self):
+        return random.choice(self.urls)
+
+    def choose_file_size(self, path):
+        """
+        Choose bandwidth file based on average bandwidth of relays on
+        circuit.
+
+        XXX: Should we just use the bandwidth of the measured relay instead?
+        """
+        avg_bw = sum([r.bandwidth for r in path])/len(path)
+        for size in sorted(self.bw_files.keys()):
+            if avg_bw*5 < size:
+                return size
+        return max(self.bw_files.keys())
+
+    def choose_url(self, path):
+        return self.get_base_url() + '/' + self.bw_files[self.choose_file_size(path)][0]
+
+
 class BwScan(object):
     def __init__(self, state, clock, measurement_dir, **kwargs):
         """
@@ -44,15 +82,8 @@ def __init__(self, state, clock, measurement_dir, **kwargs):
 
         self.tasks = []
         self.circuits = None
-        self.baseurl = 'https://bwauth.torproject.org/bwauth.torproject.org'
-        self.bw_files = {
-            64*1024: ("64M", "913b3c5df256d62235f955fa936e7a4e2d5e0cb6"),
-            32*1024: ("32M", "a536076ef51c2cfff607fec2d362671e031d6b48"),
-            16*1024: ("16M", "e91690ed2abf05e347b61aafaa23abf2a2b3292f"),
-            8*1024: ("8M", "c690229b300945ec4ba872b80e8c443e2e1750f0"),
-            4*1024: ("4M", "94f7bc6679a4419b080debd70166c2e43e80533d"),
-            2*1024: ("2M", "9793cc92932598898d22497acdd5d732037b1a13"),
-        }
+        self.BwFiles = BwFiles()
+        #self.BwFiles.add_url('https://bwauth.torproject.org/bwauth.torproject.org')
 
         self.result_sink = ResultSink(self.measurement_dir, chunk_size=10)
 
@@ -62,22 +93,6 @@ def __init__(self, state, clock, measurement_dir, **kwargs):
     def now(self):
         return time.time()
 
-    def choose_file_size(self, path):
-        """
-        Choose bandwidth file based on average bandwidth of relays on
-        circuit.
-
-        XXX: Should we just use the bandwidth of the measured relay instead?
-        """
-        avg_bw = sum([r.bandwidth for r in path])/len(path)
-        for size in sorted(self.bw_files.keys()):
-            if avg_bw*5 < size:
-                return size
-        return max(self.bw_files.keys())
-
-    def choose_url(self, path):
-        return self.baseurl + '/' + self.bw_files[self.choose_file_size(path)][0]
-
     def run_scan(self):
         all_done = defer.Deferred()
         if self.scan_continuous:
@@ -107,26 +122,28 @@ def scan_over_next_circuit():
         return all_done
 
     def fetch(self, path):
-        url = self.choose_url(path)
         assert None not in path
-        log.info("Downloading file '{file_size}' over [{relay_fp}, {exit_fp}].",
-                 file_size=url.split('/')[-1], relay_fp=path[0].id_hex, exit_fp=path[-1].id_hex)
-        file_size = self.choose_file_size(path)  # File size in MB
+        url = self.BwFiles.choose_url(path)
+        file_size = self.BwFiles.choose_file_size(path)
+        file_name = url.split('/')[-1]
+        log.info("Downloading file '{file_name}' over [{relay_fp}, {exit_fp}].",
+                 file_name=file_name, relay_fp=path[0].id_hex, exit_fp=path[-1].id_hex)
         time_start = self.now()
 
         @defer.inlineCallbacks
         def get_circuit_bw(result):
             time_end = self.now()
-            if len(result) != file_size * 1024:
+            #XXX: this is sketchy
+            if len(result) != file_size:
                 raise DownloadIncomplete
             report = dict()
             report['time_end'] = time_end
             report['time_start'] = time_start
             request_duration = report['time_end'] - report['time_start']
-            report['circ_bw'] = int((file_size * 1024) // request_duration)
+            report['circ_bw'] = (len(result) // request_duration)
             report['path'] = [r.id_hex for r in path]
             log.debug("Download took {duration} for {size} MB", duration=request_duration,
-                      size=int(file_size // 1024))
+                      size=int(len(result)// M))
 
             # We need to wait for these deferreds to be ready, we can't serialize
             # deferreds.

diff --git a/test/test_measurement.py b/test/test_measurement.py
@@ -38,7 +38,7 @@ def test_scan_chutney(self):
         # check that each run is producing the same input set!
         self.tmp = mkdtemp()
         scan = BwScan(self.tor_state, reactor, self.tmp)
-        scan.baseurl = 'http://127.0.0.1:{}'.format(self.port)
+        scan.BwFiles.add_url('http://127.0.0.1:{}'.format(self.port))
 
         def check_all_routers_measured(measurement_dir):
             """