//  This file is part of par2cmdline (a PAR 2.0 compatible file verification and
//  repair tool). See http://parchive.sourceforge.net for details of PAR 2.0.
//
//  Copyright (c) 2019 Michael D. Nahas
//
//  par2cmdline is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation; either version 2 of the License, or
//  (at your option) any later version.
//
//  par2cmdline is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


//
// This program is just to make sure I understand Tornado codes and
// how well they perform under loss.
//
// I'm going to try to calculate the structure of one level of the
// code, but not any of the data.  Then I'll simulate losses and see
// if it can recover.  I'll run a few thousand simulations to find out
// the distribution of recovery under failures.


// Tornado Codes are described in
// "Practical Loss-Resilient Codes" by Michael Luby et al.

// Below is a list of the symbols in the paper.
//
// The paper assumes that you have a streaming application.
// Each transmission in the stream is a block made of symobls.
// For "block" think Par2 file and "symbol" think slice (both
// input slices and recovery slices).
//
// n :=     The number of symbols in the block.
// p :=     Maximum loss rate you expect to recover from.
//
// Under a fully-redundent encoding, like Reed-Solomon, you would
// send n(1-p) message symbols and n*p check symbols.  ("Message
// symbols" contain data and "check symbols" contain recovery
// information.)
//
// R :=     The rate.  The portion of block made of message symbols.
//
// Thus, the number of message symbols is n*R.  
// For Reed-Solomon, the maximum R is 1-p.
//
// epsilon := The approximation factor to fully redundent encoding.
//            With high probability, the authors expect Tornado
//            Codes to recover the message symbols if R <= 1-p*(1+epsilon)
//
// Encoding time for Tornado Codes is O(n*ln(1/epsilon))
// Recovery time for Tornado Codes is O(n*ln(1/epsilon))
//
// delta :=   The fraction of symbols lost in transmission
//
//
// Chapter 2.
//
// C(B) :=    A one-level code with graph "B".
// B :=       A bipartite graph from message symbols to check symbols.
//
// n:=        REDEFINED!  Now, it's the number of message symbols.
// beta :=    The portion of check symbols in one-level of the code.
//
// So, C(B) has n message symbols and beta*n check symbols.
//
// Message symbols are bits.  Check symbols are created by xor-ing the
// values of message symbols that are connected to the check symbol in
// the bipartite graph, B.
//
// epsilon := REDEFINED!  The approximation factor for a one-level code.
//            With high probability, the authors expect the one-level
//            code to recover the message symbols if R <= 1 - beta*(1-epsilon)
//
//
// C(B_i) :=  In a multi-level code, the code at level i.
// m :=       In a multi-level code, the number of one-level codes.
//
// m is chose such that n*beta^(m+1) is approximately sqrt(n).
//
// lambda_i := The portion of message symbols with outdegree i in graph B.
//             That is, that have i connections to check symbols.
// rho_j :=   The portion of check symbols with indegree j in graph B.
//            That is, that have j connections to message symbols.
// 
// (lambda_0, lambda_1, ... lambda_m) is the distribution of outdegrees.
// (rho_0, rho_1, ..., rho_m) is the distribution of indegrees.
//
// a_l :=     Average outdegree of message symbols ("degree on left")
// a_r :=     Average indegree of check symbols ("degree on right")
//
//
// Algorithm: subgraph is the delta*n missing message symbols and all
//            beta*n check symbols (and edges between them).  NOTE: We
//            can assume all check symbols are there because they have
//            be created by the next-higher level of the multi-level
//            code.
//
// t :=       Time.  At each step of time, we find a check symbol with
//            indegree 1, and then remove it and the associated message
//            symbol.
// l_i(t) :=  portion of message symbols with outdegree i at timestep t.
// r_i(t) :=  portion of check symbols with indegree i at timestep t.
// e(t) :=    portion of edges remaining in the graph
//
// HOW DO WE GET THIS?  (pg. 5, middle of left column of text)
//   e(t) = Sum_i l_i(t) = Sum_i r_i(t)
//
//
// Solution:
// H(d) :=    1/1 + 1/2 + 1/3 + ... 1/d
//            "harmonic series to term d"
//            \approx 0.57721 + 1/(2*d) + ln(d)
// lambda_i := 1/(H(d)*(i-1))
// a_l := H(d)*((d+1)/d)
// a_r := a_l*beta
//
// solve for alpha such that:
// alpha * e^alpha / (e^alpha -1) = a_r
//
// then
// rho_i = (e^(-alpha)*alpha^(i-1)) / ((i-1)!)
//
//
// CHANGE OF ALGORITHM (pg. 7, top right column)
// gamma := fraction of nodes in new graph structure.
//
// Thus, (beta-gamma)*n check symbols are used to form a subgraph
// which is the same as the whole graph before.
//
// Each message symbol has 3 edges to the gamma*n check symbols
// in this new subgraph.
//
// They say that for any abitrarily low value (eta), they can
// choose a gamma that is bigger by a constant.  And with that, they
// can show that the algorithm runs satisfactorially with high
// probability.  But they don't say what the constant is.
//
// NOTE: I'll probably ignore this changed algorithm,
//       ... unless I hit a problem.

#include <math.h>

#include <iostream>
using std::cerr;
using std::cout;
using std::endl;

#include <vector>
using std::vector;


// From page 7, left column:

double H(int d) {
  double result = 0.0;
  for (int i = 1; i <=d; i++) {
    result += 1/((double) i);
  }
  return result;
}


// From page 7, left column:
//
// n is number of input slices
// d controls the maximum degree (which is d+1)
//   and affects the error (which is 1/d)

vector<int> lambda_times_n(int n, int d) {
  vector<int> result;

  // none of degree 0.
  result.push_back(0);

  // none of degree 1.
  result.push_back(0);

  double Hd = H(d);
  
  double sum = 0;
  int num_output = 0;
  for (int i = 2; i <= d+1; i++) {
    if (i > n) {
      cerr << "Something went very wrong.  i > n" << endl;
      exit(1);
    }
    
    double lambda_i = 1/(Hd*(i-1));
    sum += lambda_i;
    int next_num_output = (int) round(sum * n);
    result.push_back(next_num_output - num_output);

    // cout << "  lambda " << i << " = " << result.back() << "  sum=" << sum << " H(d)=" << H(d) << " next_num_output=" << next_num_output << endl;
    
    num_output = next_num_output;
  }

  // at end, num_output should equal n.
  cout << "At end of calculation:  n=" << n << "   num_output=" << num_output << endl;

  return result;
}


double average_degree(const vector<int>& num_with_degree) {
  double sum = 0.0;
  int count = 0;
  
  for (int i = 0; i < num_with_degree.size(); i++) {
    sum += i*num_with_degree[i];
    count += num_with_degree[i];
  }

  return sum/count;
}


int main(int argc, char **argv) {
  int n = 10000;

  int d = 100; // at most 1% worse
  
  vector<int> data_with_degree = lambda_times_n(n, d);
  for (int i = 0; i < data_with_degree.size(); i++) {
    cout << i << "\t" << data_with_degree[i] << endl;
  }
  cout << endl;
  cout << "average degree = " << average_degree(data_with_degree) << endl;
  cout << "luby expected =  " << (H(d)*(d+1)/d) << endl;
  cout << "my expected   =  " << (1 + d/H(d)) << endl;
  cout << endl;
  cout << "H(d)=" << H(d) << endl;
  return 0;
}