// This file is part of par2cmdline (a PAR 2.0 compatible file verification and // repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. // // Copyright (c) 2019 Michael D. Nahas // // par2cmdline is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // par2cmdline is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // This program is just to make sure I understand Tornado codes and // how well they perform under loss. // // I'm going to try to calculate the structure of one level of the // code, but not any of the data. Then I'll simulate losses and see // if it can recover. I'll run a few thousand simulations to find out // the distribution of recovery under failures. // Tornado Codes are described in // "Practical Loss-Resilient Codes" by Michael Luby et al. // Below is a list of the symbols in the paper. // // The paper assumes that you have a streaming application. // Each transmission in the stream is a block made of symobls. // For "block" think Par2 file and "symbol" think slice (both // input slices and recovery slices). // // n := The number of symbols in the block. // p := Maximum loss rate you expect to recover from. // // Under a fully-redundent encoding, like Reed-Solomon, you would // send n(1-p) message symbols and n*p check symbols. ("Message // symbols" contain data and "check symbols" contain recovery // information.) // // R := The rate. The portion of block made of message symbols. // // Thus, the number of message symbols is n*R. // For Reed-Solomon, the maximum R is 1-p. // // epsilon := The approximation factor to fully redundent encoding. // With high probability, the authors expect Tornado // Codes to recover the message symbols if R <= 1-p*(1+epsilon) // // Encoding time for Tornado Codes is O(n*ln(1/epsilon)) // Recovery time for Tornado Codes is O(n*ln(1/epsilon)) // // delta := The fraction of symbols lost in transmission // // // Chapter 2. // // C(B) := A one-level code with graph "B". // B := A bipartite graph from message symbols to check symbols. // // n:= REDEFINED! Now, it's the number of message symbols. // beta := The portion of check symbols in one-level of the code. // // So, C(B) has n message symbols and beta*n check symbols. // // Message symbols are bits. Check symbols are created by xor-ing the // values of message symbols that are connected to the check symbol in // the bipartite graph, B. // // epsilon := REDEFINED! The approximation factor for a one-level code. // With high probability, the authors expect the one-level // code to recover the message symbols if R <= 1 - beta*(1-epsilon) // // // C(B_i) := In a multi-level code, the code at level i. // m := In a multi-level code, the number of one-level codes. // // m is chose such that n*beta^(m+1) is approximately sqrt(n). // // lambda_i := The portion of EDGES where the message symbol has // outdegree i in graph B. // That is, they have i connections to check symbols. // rho_j := The portion of EDGES where the check symbols have // indegree j in graph B. // That is, they have j connections to message symbols. // // NOTE: This is NOT the portion of NODES, but edges. I missed // this the first time I read it. A node with out-degree 2 will // have 2 edges, so the portions get distorted. // // (lambda_0, lambda_1, ... lambda_m) is the distribution of edges // by outdegrees. // (rho_0, rho_1, ..., rho_m) is the distribution of edges // by indegrees. // // a_l := Average outdegree of message symbols ("degree on left") // a_r := Average indegree of check symbols ("degree on right") // // Because lambda and rho are defined by edges, not nodes, you get // a_l = 1/(sum_i over (lambda_i / i)) // a_r = 1/(sum_i over (rho_i / i)) // // Algorithm: subgraph is the delta*n missing message symbols and all // beta*n check symbols (and edges between them). NOTE: We // can assume all check symbols are there because they have // be created by the next-higher level of the multi-level // code. // // t := Time. At each step of time, we find a check symbol with // indegree 1, and then remove it and the associated message // symbol. // l_i(t) := portion of EDGES where message symbols have outdegree // i at timestep t. // r_i(t) := portion of EDGES where check symbols have indegree // i at timestep t. // e(t) := portion of edges remaining in the graph // // // // Solution: // H(d) := 1/1 + 1/2 + 1/3 + ... 1/d // "harmonic series to term d" // \approx 0.57721 + 1/(2*d) + ln(d) // lambda_i := 1/(H(d)*(i-1)) // a_l := H(d)*((d+1)/d) // a_r := a_l*beta // // solve for alpha such that: // alpha * e^alpha / (e^alpha -1) = a_r // // then // rho_i = (e^(-alpha)*alpha^(i-1)) / ((i-1)!) // // // CHANGE OF ALGORITHM (pg. 7, top right column) // gamma := fraction of nodes in new graph structure. // // Thus, (beta-gamma)*n check symbols are used to form a subgraph // which is the same as the whole graph before. // // Each message symbol has 3 edges to the gamma*n check symbols // in this new subgraph. // // They say that for any abitrarily low value (eta), they can // choose a gamma that is bigger by a constant. And with that, they // can show that the algorithm runs satisfactorially with high // probability. But they don't say what the constant is. // // NOTE: I'll probably ignore this changed algorithm, // ... unless I hit a problem. #include #include using std::cerr; using std::cout; using std::endl; #include using std::vector; // From page 7, left column: double H(int d) { double result = 0.0; for (int i = 1; i <=d; i++) { result += 1/((double) i); } return result; } // From page 7, left column: // // d controls the maximum degree (which is d+1) // and affects the error (which is 1/d) // // I suggest d = n-1. vector compute_lambda(int d) { vector result; // none of degree 0. result.push_back(0.0); // none of degree 1. result.push_back(0.0); double Hd = H(d); for (int i = 2; i <= d+1; i++) { double lambda_i = 1/(Hd*(i-1)); result.push_back( lambda_i ); cout << " lambda " << i << " = " << result.back() << " H(d)=" << H(d) << endl; } return result; } // n is the number of nodes. // lambda is the distribution of EDGES with outdegree i and computes // the distribution of data slices with outdegree i. vector compute_outdegrees(int n, const vector &lambda) { vector result; double scale_factor = 0.0; // avoid 0 to avoid divide by zero. for (int i = 1; i < lambda.size(); i++) { scale_factor += lambda[i]/i; } // Special case i=0 to avoid divide by 0. result.push_back(0.0); cout << "scale_factor=" << scale_factor << endl; double sum = 0.0; int output_count = 0; for (int i = 1; i < lambda.size(); i++) { sum += (lambda[i]/i)/scale_factor; int next_output_count = (int) round(sum*n); result.push_back(next_output_count - output_count); cout << "i="<< i << " lambda_i=" << lambda[i] << " outdegree=" << (next_output_count - output_count) << endl; output_count = next_output_count; } if (output_count != n) { cerr << "Whoa, something went wrong when calculating the outdegree distribution. " << output_count << " != " << n << endl; exit(1); } return result; } double average_degree(const vector& num_with_degree) { double sum = 0.0; int count = 0; for (int i = 0; i < num_with_degree.size(); i++) { sum += i*num_with_degree[i]; count += num_with_degree[i]; } return sum/count; } int main(int argc, char **argv) { int n = 1000000; int d = 10000; // at most 0.01% worse vector lambda = compute_lambda(d); vector outdegrees = compute_outdegrees(n, lambda); for (int i = 0; i < outdegrees.size(); i++) { cout << i << "\t" << outdegrees[i] << endl; } cout << endl; cout << "average degree = " << average_degree(outdegrees) << endl; cout << "luby expected = " << (H(d)*(d+1)/d) << endl; cout << endl; return 0; }