<a href="https://colab.research.google.com/github/RHMalalatiana/Recombination_code/blob/main/Simulation2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import norm,  probplot
import math as mt

In [2]:
def simulate_distances(tAI, tBI, tCI, lam):
  """
  simulate_distances is a function that simulates the distances
  between the internal sequences I to the external sequences A, B and C,
  following a Poisson distribution.
  Also it calculates the distance between the external sequences A, B and C.
  Input: time divergence, mutation rate
  Output: distance between the internal sequences I to the external sequences A, and B
          distance between A,B and C
  """
  dAI = np.random.poisson(lam * tAI)
  dBI = np.random.poisson(lam * tBI)
  dCI = np.random.poisson(lam * tCI)
    # dAI = stats.poisson.rvs(lam * tAI, size=1)[0]
    # dBI = stats.poisson.rvs(lam * tBI, size=1)[0]
    # dCI = stats.poisson.rvs(lam * tCI, size=1)[0]


  dAB = dAI + dBI
  dAC = dAI + dCI
  dBC = dBI + dCI

  return dAB, dAC, dBC, dAI, dBI, dCI

def compute_z_score(dAI, dBI, lam, tAI, tBI):
  """
  compute_z_score is a function that calculate the Z score for our test statistic S.
  Input: distance between the internal sequences I to the external sequences A, and B
         time divergence, mutation rate
  Output: Z score
  """
  S = abs(dAI - dBI)
  var_S = lam * (tAI + tBI)
  Z = S / mt.sqrt(var_S) if var_S > 0 else 0
  return Z

def estimate_distances(tAI,tBI,tCI,dAB,dAC,dBC):
  """
  estimate_distances is a function that estimate the mutation rate lambda then
  deduced the estimate distances between A,B and C by the estimated distances
  between the internal sequences I to the external sequences A, B and C by that
  value of lamda.
  Input: time divergence, distance between A,B and C
  Output: estimated distance between A,B and C
  """
  lambda_hat=(dAB+dAC+dBC)/(2*(tAI+tBI+tCI))
  dAI_hat=lambda_hat*tAI
  dBI_hat=lambda_hat*tBI
  dCI_hat=lambda_hat*tCI
  dAB_hat = dAI_hat + dBI_hat
  dAC_hat = dAI_hat + dCI_hat
  dBC_hat = dBI_hat + dCI_hat
  return dAB_hat,dAC_hat,dBC_hat

def simulate_case(tAI,tBI,tCI,lam):
  """
  simulate_case simulate a case of recombination or non recombination.
  Input: time divergence, mutation rate
  Output: Z score
  """
  dAB, dAC, dBC, dAI, dBI,dCI = simulate_distances(tAI, tBI, tCI, lam)
  Z = compute_z_score(dAI, dBI, lam, tAI, tBI)
  if Z > 1.96:
    Z = compute_z_score(dAI, dCI, lam, tAI, tCI)
    if Z<1.96:
      cas='non-recombination'
    else:
      cas='recombination'
    return cas
  else:
    cas='non-recombination'
  return cas

In [5]:
# Run simulations Z score
np.random.seed(42)
n_simulations = 1000000
tAI, tBI, tCI = 100.0, 100.0, 300.0
lam=0.03
non_recomb_case = [simulate_case(tAI, tBI, tCI,lam) for _ in range(n_simulations)]
recomb_case = [simulate_case(300.0, 100.0, 100.0,lam) for _ in range(n_simulations)]

In [6]:
false_positive = sum(1 for cas in non_recomb_case if cas=='recombination')
print(f"Number of false positives: {false_positive} which is {false_positive*100/n_simulations}%")

Number of false positives: 24281 which is 2.4281%


In [7]:
false_negative = sum(1 for cas in recomb_case if cas=='non_recombination')
print(f"Number of false negatives: {false_negative} which is {false_negative*100/n_simulations}%")

Number of false negatives: 0 which is 0.0%
