//  This file is part of par2cmdline (a PAR 2.0 compatible file verification and
//  repair tool). See http://parchive.sourceforge.net for details of PAR 2.0.
//
//  Copyright (c) 2019 Michael D. Nahas
//
//  par2cmdline is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation; either version 2 of the License, or
//  (at your option) any later version.
//
//  par2cmdline is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


//
// This program is just to make sure I understand Tornado codes and
// how well they perform under loss.
//
// I'm going to try to calculate the structure of one level of the
// code, but not any of the data.  Then I'll simulate losses and see
// if it can recover.  I'll run a few thousand simulations to find out
// the distribution of recovery under failures.


// Tornado Codes are described in
// "Practical Loss-Resilient Codes" by Michael Luby et al.

// Below is a list of the symbols in the paper.
//
// The paper assumes that you have a streaming application.
// Each transmission in the stream is a block made of symobls.
// For "block" think Par2 file and "symbol" think slice (both
// input slices and recovery slices).
//
// n :=     The number of symbols in the block.
// p :=     Maximum loss rate you expect to recover from.
//
// Under a fully-redundent encoding, like Reed-Solomon, you would
// send n(1-p) message symbols and n*p check symbols.  ("Message
// symbols" contain data and "check symbols" contain recovery
// information.)
//
// R :=     The rate.  The portion of block made of message symbols.
//
// Thus, the number of message symbols is n*R.  
// For Reed-Solomon, the maximum R is 1-p.
//
// epsilon := The approximation factor to fully redundent encoding.
//            With high probability, the authors expect Tornado
//            Codes to recover the message symbols if R <= 1-p*(1+epsilon)
//
// Encoding time for Tornado Codes is O(n*ln(1/epsilon))
// Recovery time for Tornado Codes is O(n*ln(1/epsilon))
//
// delta :=   The fraction of symbols lost in transmission
//
//
// Chapter 2.
//
// C(B) :=    A one-level code with graph "B".
// B :=       A bipartite graph from message symbols to check symbols.
//
// n:=        REDEFINED!  Now, it's the number of message symbols.
// beta :=    The portion of check symbols in one-level of the code.
//
// So, C(B) has n message symbols and beta*n check symbols.
//
// Message symbols are bits.  Check symbols are created by xor-ing the
// values of message symbols that are connected to the check symbol in
// the bipartite graph, B.
//
// epsilon := REDEFINED!  The approximation factor for a one-level code.
//            With high probability, the authors expect the one-level
//            code to recover the message symbols if R <= 1 - beta*(1-epsilon)
//
//
// C(B_i) :=  In a multi-level code, the code at level i.
// m :=       In a multi-level code, the number of one-level codes.
//
// m is chose such that n*beta^(m+1) is approximately sqrt(n).
//
// lambda_i := The portion of EDGES where the message symbol has
//             outdegree i in graph B.
//             That is, they have i connections to check symbols.
// rho_j :=   The portion of EDGES where the check symbols have
//            indegree j in graph B.
//            That is, they have j connections to message symbols.
//
// NOTE: This is NOT the portion of NODES, but edges.  I missed
// this the first time I read it.  A node with out-degree 2 will
// have 2 edges, so the portions get distorted.  
// 
// (lambda_0, lambda_1, ... lambda_m) is the distribution of edges
//                                    by outdegrees.
// (rho_0, rho_1, ..., rho_m) is the distribution of edges
//                                    by indegrees.
//
// a_l :=     Average outdegree of message symbols ("degree on left")
// a_r :=     Average indegree of check symbols ("degree on right")
//
// Because lambda and rho are defined by edges, not nodes, you get
// a_l = 1/(sum_i over (lambda_i / i))
// a_r = 1/(sum_i over (rho_i / i))
//
// Algorithm: subgraph is the delta*n missing message symbols and all
//            beta*n check symbols (and edges between them).  NOTE: We
//            can assume all check symbols are there because they have
//            be created by the next-higher level of the multi-level
//            code.
//
// t :=       Time.  At each step of time, we find a check symbol with
//            indegree 1, and then remove it and the associated message
//            symbol.
// l_i(t) :=  portion of EDGES where message symbols have outdegree
//            i at timestep t.
// r_i(t) :=  portion of EDGES where check symbols have indegree
//            i at timestep t.
// e(t) :=    portion of edges remaining in the graph
//
//
//
// Solution:
// H(d) :=    1/1 + 1/2 + 1/3 + ... 1/d
//            "harmonic series to term d"
//            \approx 0.57721 + 1/(2*d) + ln(d)
// lambda_i := 1/(H(d)*(i-1))
// a_l := H(d)*((d+1)/d)
// a_r := a_l*beta
//
// solve for alpha such that:
// alpha * e^alpha / (e^alpha -1) = a_r
//
// then
// rho_i = (e^(-alpha)*alpha^(i-1)) / ((i-1)!)
//
//
// CHANGE OF ALGORITHM (pg. 7, top right column)
// gamma := fraction of nodes in new graph structure.
//
// Thus, (beta-gamma)*n check symbols are used to form a subgraph
// which is the same as the whole graph before.
//
// Each message symbol has 3 edges to the gamma*n check symbols
// in this new subgraph.
//
// They say that for any abitrarily low value (eta), they can
// choose a gamma that is bigger by a constant.  And with that, they
// can show that the algorithm runs satisfactorially with high
// probability.  But they don't say what the constant is.
//
// NOTE: I'll probably ignore this changed algorithm,
//       ... unless I hit a problem.

#include <math.h>

#include <iostream>
using std::cerr;
using std::cout;
using std::endl;

#include <vector>
using std::vector;


// From page 7, left column:

double H(int d) {
  double result = 0.0;
  for (int i = 1; i <=d; i++) {
    result += 1/((double) i);
  }
  return result;
}


// From page 7, left column:
//
// d controls the maximum degree (which is d+1)
//   and affects the error (which is 1/d)
//
// I suggest d = n-1.
vector<double> compute_lambda(int d) {
  vector<double> result;

  // none of degree 0.
  result.push_back(0.0);

  // none of degree 1.
  result.push_back(0.0);

  double Hd = H(d);
  
  for (int i = 2; i <= d+1; i++) {
    double lambda_i = 1/(Hd*(i-1));
    result.push_back( lambda_i );

    cout << "  lambda " << i << " = " << result.back() << " H(d)=" << H(d) << endl;
  }

  return result;
}


// n is the number of nodes.
// lambda is the distribution of EDGES with outdegree i and computes
// the distribution of data slices with outdegree i.
vector<int> compute_outdegrees(int n, const vector<double> &lambda) {
  vector<int> result;
  
  double scale_factor = 0.0;

  // avoid 0 to avoid divide by zero.
  for (int i = 1; i < lambda.size(); i++) {
    scale_factor += lambda[i]/i;
  }

  // Special case i=0 to avoid divide by 0.
  result.push_back(0.0); 

  cout << "scale_factor=" << scale_factor << endl;
  
  double sum = 0.0;
  int output_count = 0;
  for (int i = 1; i < lambda.size(); i++) {
    sum += (lambda[i]/i)/scale_factor;
    int next_output_count = (int) round(sum*n);
    result.push_back(next_output_count - output_count);

    cout << "i="<< i << " lambda_i=" << lambda[i] << " outdegree=" << (next_output_count - output_count) << endl;
    output_count = next_output_count;
  }

  if (output_count != n) {
    cerr << "Whoa, something went wrong when calculating the outdegree distribution. " << output_count << " != " << n << endl;
    exit(1);
  }

  return result;
}


double average_degree(const vector<int>& num_with_degree) {
  double sum = 0.0;
  int count = 0;
  
  for (int i = 0; i < num_with_degree.size(); i++) {
    sum += i*num_with_degree[i];
    count += num_with_degree[i];
  }

  return sum/count;
}


int main(int argc, char **argv) {
  int n = 1000000;

  int d = 10000; // at most 0.01% worse
  
  vector<double> lambda = compute_lambda(d);
  vector<int> outdegrees = compute_outdegrees(n, lambda);
  
  for (int i = 0; i < outdegrees.size(); i++) {
    cout << i << "\t" << outdegrees[i] << endl;
  }
  cout << endl;
  cout << "average degree = " << average_degree(outdegrees) << endl;
  cout << "luby expected =  " << (H(d)*(d+1)/d) << endl;
  cout << endl;
  return 0;
}