// This file is part of par2cmdline (a PAR 2.0 compatible file verification and // repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. // // Copyright (c) 2019 Michael D. Nahas // // par2cmdline is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // par2cmdline is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // This program is just to make sure I understand Tornado codes and // how well they perform under loss. // // I'm going to try to calculate the structure of one level of the // code, but not any of the data. Then I'll simulate losses and see // if it can recover. I'll run a few thousand simulations to find out // the distribution of recovery under failures. // Tornado Codes are described in // "Practical Loss-Resilient Codes" by Michael Luby et al. // Below is a list of the symbols in the paper. // // The paper assumes that you have a streaming application. // Each transmission in the stream is a block made of symobls. // For "block" think Par2 file and "symbol" think slice (both // input slices and recovery slices). // // n := The number of symbols in the block. // p := Maximum loss rate you expect to recover from. // // Under a fully-redundent encoding, like Reed-Solomon, you would // send n(1-p) message symbols and n*p check symbols. ("Message // symbols" contain data and "check symbols" contain recovery // information.) // // R := The rate. The portion of block made of message symbols. // // Thus, the number of message symbols is n*R. // For Reed-Solomon, the maximum R is 1-p. // // epsilon := The approximation factor to fully redundent encoding. // With high probability, the authors expect Tornado // Codes to recover the message symbols if R <= 1-p*(1+epsilon) // // Encoding time for Tornado Codes is O(n*ln(1/epsilon)) // Recovery time for Tornado Codes is O(n*ln(1/epsilon)) // // delta := The fraction of symbols lost in transmission // // // Chapter 2. // // C(B) := A one-level code with graph "B". // B := A bipartite graph from message symbols to check symbols. // // n:= REDEFINED! Now, it's the number of message symbols. // beta := The portion of check symbols in one-level of the code. // // So, C(B) has n message symbols and beta*n check symbols. // // Message symbols are bits. Check symbols are created by xor-ing the // values of message symbols that are connected to the check symbol in // the bipartite graph, B. // // epsilon := REDEFINED! The approximation factor for a one-level code. // With high probability, the authors expect the one-level // code to recover the message symbols if R <= 1 - beta*(1-epsilon) // // // C(B_i) := In a multi-level code, the code at level i. // m := In a multi-level code, the number of one-level codes. // // m is chose such that n*beta^(m+1) is approximately sqrt(n). // // lambda_i := The portion of message symbols with outdegree i in graph B. // That is, that have i connections to check symbols. // rho_j := The portion of check symbols with indegree j in graph B. // That is, that have j connections to message symbols. // // (lambda_0, lambda_1, ... lambda_m) is the distribution of outdegrees. // (rho_0, rho_1, ..., rho_m) is the distribution of indegrees. // // a_l := Average outdegree of message symbols ("degree on left") // a_r := Average indegree of check symbols ("degree on right") // // // Algorithm: subgraph is the delta*n missing message symbols and all // beta*n check symbols (and edges between them). NOTE: We // can assume all check symbols are there because they have // be created by the next-higher level of the multi-level // code. // // t := Time. At each step of time, we find a check symbol with // indegree 1, and then remove it and the associated message // symbol. // l_i(t) := portion of message symbols with outdegree i at timestep t. // r_i(t) := portion of check symbols with indegree i at timestep t. // e(t) := portion of edges remaining in the graph // // HOW DO WE GET THIS? (pg. 5, middle of left column of text) // e(t) = Sum_i l_i(t) = Sum_i r_i(t) // // // Solution: // H(d) := 1/1 + 1/2 + 1/3 + ... 1/d // "harmonic series to term d" // \approx 0.57721 + 1/(2*d) + ln(d) // lambda_i := 1/(H(d)*(i-1)) // a_l := H(d)*((d+1)/d) // a_r := a_l*beta // // solve for alpha such that: // alpha * e^alpha / (e^alpha -1) = a_r // // then // rho_i = (e^(-alpha)*alpha^(i-1)) / ((i-1)!) // // // CHANGE OF ALGORITHM (pg. 7, top right column) // gamma := fraction of nodes in new graph structure. // // Thus, (beta-gamma)*n check symbols are used to form a subgraph // which is the same as the whole graph before. // // Each message symbol has 3 edges to the gamma*n check symbols // in this new subgraph. // // They say that for any abitrarily low value (eta), they can // choose a gamma that is bigger by a constant. And with that, they // can show that the algorithm runs satisfactorially with high // probability. But they don't say what the constant is. // // NOTE: I'll probably ignore this changed algorithm, // ... unless I hit a problem. #include #include using std::cerr; using std::cout; using std::endl; #include using std::vector; // From page 7, left column: double H(int d) { double result = 0.0; for (int i = 1; i <=d; i++) { result += 1/((double) i); } return result; } // From page 7, left column: // // n is number of input slices // d controls the maximum degree (which is d+1) // and affects the error (which is 1/d) vector lambda_times_n(int n, int d) { vector result; // none of degree 0. result.push_back(0); // none of degree 1. result.push_back(0); double Hd = H(d); double sum = 0; int num_output = 0; for (int i = 2; i <= d+1; i++) { if (i > n) { cerr << "Something went very wrong. i > n" << endl; exit(1); } double lambda_i = 1/(Hd*(i-1)); sum += lambda_i; int next_num_output = (int) round(sum * n); result.push_back(next_num_output - num_output); // cout << " lambda " << i << " = " << result.back() << " sum=" << sum << " H(d)=" << H(d) << " next_num_output=" << next_num_output << endl; num_output = next_num_output; } // at end, num_output should equal n. cout << "At end of calculation: n=" << n << " num_output=" << num_output << endl; return result; } double average_degree(const vector& num_with_degree) { double sum = 0.0; int count = 0; for (int i = 0; i < num_with_degree.size(); i++) { sum += i*num_with_degree[i]; count += num_with_degree[i]; } return sum/count; } int main(int argc, char **argv) { int n = 10000; int d = 100; // at most 1% worse vector data_with_degree = lambda_times_n(n, d); for (int i = 0; i < data_with_degree.size(); i++) { cout << i << "\t" << data_with_degree[i] << endl; } cout << endl; cout << "average degree = " << average_degree(data_with_degree) << endl; cout << "luby expected = " << (H(d)*(d+1)/d) << endl; cout << "my expected = " << (1 + d/H(d)) << endl; cout << endl; cout << "H(d)=" << H(d) << endl; return 0; }