# Process files, compute avg and totals


You have a folder full of proprietary .bin files and a class BinToTsvConverter with a method .convert(filename), which converts a .bin file to a .tsv file.  The .tsv file has no header row and the following schema:

    Total_Connections Latency Bandwidth
    65                  70     20

Calculate the average latency and total bandwidth.

This should take O(N) time, where N is the total number of rows across all of the files combined.  If we have a huge number of files, we can read them in parallel, compute the statistics per file, and then combine the per-file statistics (MapReduce-style).

In [2]:
class BinToTsvConverter:
    """Placeholder for the proprietary .bin -> .tsv converter.

    Both methods are stubs in this file: they contain no logic.
    """

    def __init__(self):
        """Create a converter; the stub keeps no state."""

    def convert(self, filename):
        """Convert ``filename`` and write the result to ``<filename>.tsv``.

        That is the intended contract; this stub performs no conversion,
        writes nothing, and returns None.
        """
        return None

def read_calculate_latency_bandwidth(filenames):
    """Return the average latency and total bandwidth across all files.

    Each name in ``filenames`` is passed to ``BinToTsvConverter.convert``
    and the resulting ``<filename>.tsv`` is read as headerless,
    tab-separated rows of ``connections, latency, bandwidth``.

    Args:
        filenames: Iterable of file names to convert and read.

    Returns:
        A tuple ``(average_latency, total_bandwidth)``; both are floats.

    Raises:
        ValueError: If a non-blank row does not have exactly three
            tab-separated fields, or its latency/bandwidth is not numeric.
        ZeroDivisionError: If no data rows were found, because the
            average latency is undefined in that case.
    """
    # Float accumulators also guarantee true division for the average.
    latency_total = 0.0
    bandwidth_total = 0.0
    rows_total = 0

    converter = BinToTsvConverter()

    for filename in filenames:
        converter.convert(filename)
        with open("{}.tsv".format(filename)) as f:
            # Stream the file line by line instead of loading it whole.
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. a trailing newline
                # Fields are text; convert them before accumulating.
                # The connections column is not needed for these statistics.
                _connections, latency, bandwidth = line.split("\t")
                latency_total += float(latency)
                bandwidth_total += float(bandwidth)
                rows_total += 1

    if rows_total == 0:
        # Same exception type the bare division would raise, with a
        # message that names the actual cause.
        raise ZeroDivisionError(
            "no data rows found; average latency is undefined"
        )

    return (latency_total / rows_total, bandwidth_total)


# map-reduce style

def read_file_calc_stats(filename):
    """Convert one file and return its partial statistics (the map step).

    ``filename`` is passed to ``BinToTsvConverter.convert`` and the
    resulting ``<filename>.tsv`` is read as headerless, tab-separated rows
    of ``connections, latency, bandwidth``.

    Args:
        filename: Name of the file to convert and read.

    Returns:
        A tuple ``(connections_total, latency_total, bandwidth_total,
        rows_total)``. The three totals are floats and ``rows_total`` is an
        int; a file with no data rows yields ``(0.0, 0.0, 0.0, 0)``.

    Raises:
        ValueError: If a non-blank row does not have exactly three
            tab-separated numeric fields.
    """
    converter = BinToTsvConverter()
    converter.convert(filename)

    connections_total = 0.0
    latency_total = 0.0
    bandwidth_total = 0.0
    rows_total = 0

    with open("{}.tsv".format(filename)) as f:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            # Fields are text; convert them before accumulating.
            connections, latency, bandwidth = line.split("\t")
            connections_total += float(connections)
            latency_total += float(latency)
            bandwidth_total += float(bandwidth)
            rows_total += 1

    return (connections_total, latency_total, bandwidth_total, rows_total)

def combine_stats(t1, t2):
    """Add two partial-statistics sequences position by position.

    Only the first four positions of ``t1`` and ``t2`` are read; the
    result is always a 4-tuple of the pairwise sums.
    """
    return tuple(t1[position] + t2[position] for position in range(4))

def compute_latency_bandwidth(summary_stats):
    """Turn combined statistics into ``(average_latency, total_bandwidth)``.

    ``summary_stats`` is indexed as ``[1]`` latency total, ``[2]`` bandwidth
    total and ``[3]`` row count. A row count of zero raises
    ZeroDivisionError.
    """
    average_latency = summary_stats[1] / summary_stats[3]
    return (average_latency, summary_stats[2])

# Example usage (on Python 3, first run `from functools import reduce`):
# compute_latency_bandwidth(reduce(combine_stats, map(read_file_calc_stats, filenames)))