In [1]:
import numpy as np
import plotly.express as px
import pandas as pd

def load_all_ancients(replace_nan=True):
  x = np.load('data/genotype_ancient_refs.npy')
  
  #x_s = select_snps(x)

  # normalization
  genomean_path = 'data/genomean.csv'
  genomean = pd.read_csv(genomean_path, header=0)
  genomean = genomean['x'].values
  snp_drift = np.sqrt((genomean / 2) * (1 - genomean/2))

  # replace nans by genomean
  if replace_nan:
    x_s_i = np.nan_to_num(x, nan=genomean)
  else:
    x_s_i = x

  # normalization
  x_s_i_c = (x_s_i - genomean) / snp_drift

  return x_s_i_c

def compute_embeddings(V, z):   
  index = np.isnan(z)
  z_stern=np.delete(z, index)
  P_stern = np.delete(V[:, 0:2], index, axis = 0)
  dotp = np.dot(P_stern.T, P_stern).astype(float)

  dotz = np.dot(P_stern.T, z_stern).astype(float)
  tau = np.dot(np.linalg.inv(dotp), dotz)
  return tau

def var_discrepency(V_obs, var_tau_r):
  """
  Computes the variance discrepancy between the estimated embedding and the true embedding.

  Parameters:
  - V_obs (numpy.ndarray): Eigenvector matrix at the observed features that were used to compute.
  - var_tau_r (numpy.ndarray): The expected covariance matrix of the PCs (all but the first two).

  Returns:
  - numpy.ndarray: The variance in discrepancy between the estimated embedding and the true embedding.
  """
  matrix_of_linear_map = - np.linalg.inv(V_obs[:, 0:2].T @ V_obs[:, 0:2]) @ V_obs[:, 0:2].T @ V_obs[:, 2:]
  return matrix_of_linear_map @ var_tau_r @ matrix_of_linear_map.T


In [2]:
V = np.load('data/eigenvectors.npy')
Lambda = np.load('data/eigenvalues.npy')

In [3]:
# load data
d = load_all_ancients(replace_nan=False)

In [4]:
# get low-coverage samples
nan_counts = [np.count_nonzero(~np.isnan(d[i])) for i in range(np.shape(d)[0])]

In [5]:
low_coverage_ancient_indices = np.where(np.array(nan_counts)/np.shape(d)[1] < 0.05)[0]
low_coverage_ancients = d[low_coverage_ancient_indices]
print(low_coverage_ancients.shape)

(498, 540247)


In [6]:
high_coverage_ancient_indices = np.where(np.array(nan_counts)/np.shape(d)[1] > 0.7)[0]
high_coverage_ancients = d[high_coverage_ancient_indices]
print(high_coverage_ancients.shape)

(1947, 540247)


In [7]:
modern_df = pd.read_csv('data/embedding_modern_refs.csv')
palette = px.colors.qualitative.Vivid + px.colors.qualitative.Vivid
palette = [px.colors.unconvert_from_RGB_255(px.colors.unlabel_rgb(c)) for c in palette]

metadata = pd.read_csv('data/metadata_ancient_refs.csv', header=0)
metadata_high = metadata.loc[high_coverage_ancient_indices]
metadata = metadata.loc[low_coverage_ancient_indices]
metadata.head()

Unnamed: 0,Genetic_ID,Master_ID,Data_source,Group_ID,political_entity,epoch
53,I14343_d,I14343,1240K,Armenia_EBA_KuraAraxes,Armenia,Chalcolithic (Copper Age) (C)
55,I14346,I14346,1240K,Armenia_EBA_KuraAraxes,Armenia,Late Neolithic (LN)
57,I16706,I16706,1240K,Armenia_EBA_KuraAraxes,Armenia,Late Neolithic (LN)
131,I14605,I14605,1240K,Armenia_LBA_EIA,Armenia,Early Iron Age (EIA)
198,I19340_v54.1_addback,I19340,1240K,Armenia_MBA,Armenia,Chalcolithic (Copper Age) (C)


In [8]:
np.random.seed(100)
selection = np.random.choice(np.arange(0, low_coverage_ancients.shape[0]), size=5, replace=False)
selection_high = np.random.choice(np.arange(0, high_coverage_ancients.shape[0]), size=4, replace=False)
selection = np.sort(selection)
selection_high = np.sort(selection_high)
selection = [136, 140, 353, 459]
print(selection)
s = low_coverage_ancients[selection]
s_high = high_coverage_ancients[selection_high]
print([np.count_nonzero(~np.isnan(s[i])) for i in range(np.shape(s)[0])])
s_emb = np.array([compute_embeddings(V, i) for i in s])
s_emb_high = np.array([compute_embeddings(V, i) for i in s_high])
factors = np.load('data/factors.npy')
vars = []
for i in s:
  observed_inds = ~np.isnan(i)
  var_dis = var_discrepency(V[observed_inds], np.diag(Lambda[2:] * factors[2:]))
  vars.append(var_dis)

vars_high = []
for i in s_high:
  observed_inds = ~np.isnan(i)
  var_dis = var_discrepency(V[observed_inds], np.diag(Lambda[2:] * factors[2:]))
  vars_high.append(var_dis)

metadata_sel = metadata.iloc[selection]
metadata_sel['coverage'] = [np.count_nonzero(~np.isnan(s[i])) for i in range(np.shape(s)[0])]
metadata_sel_high = metadata_high.iloc[selection_high]
metadata_sel_high['coverage'] = [np.count_nonzero(~np.isnan(s_high[i])) for i in range(np.shape(s_high)[0])]
print(metadata_sel.head)
metadata_sel.to_csv('paper_figures/real_world/metadata_ancients_high_uncertainty.csv')
np.save('paper_figures/real_world/variance_ancients_high_uncertainty.npy', np.array(vars))
np.save('paper_figures/real_world/mean_ancients_high_uncertainty.npy', np.array(s_emb))
metadata_sel_high.to_csv('paper_figures/real_world/metadata_ancients_low_uncertainty.csv')
np.save('paper_figures/real_world/variance_ancients_low_uncertainty.npy', np.array(vars_high))
np.save('paper_figures/real_world/mean_ancients_low_uncertainty.npy', np.array(s_emb_high))

[136, 140, 353, 459]
[21772, 9116, 11481, 14892]
<bound method NDFrame.head of             Genetic_ID Master_ID Data_source                        Group_ID  \
2202  STR266b_noUDG.SG   STR266b     Shotgun        Germany_EarlyMedieval.SG   
2248             I2014     I2014       1240K                  Germany_EN_LBK   
5141    VK470_noUDG.SG     VK470     Shotgun                Russia_Viking.SG   
6292   cth842_noUDG.SG    cth842     Shotgun  Turkey_Catalhoyuk_N_Ceramic.SG   

     political_entity                    epoch  coverage  
2202          Germany  Early Middle Ages (EMA)     21772  
2248          Germany               Mesolithic      9116  
5141           Russia  Early Middle Ages (EMA)     11481  
6292           Turkey               Mesolithic     14892  >


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_sel['coverage'] = [np.count_nonzero(~np.isnan(s[i])) for i in range(np.shape(s)[0])]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_sel_high['coverage'] = [np.count_nonzero(~np.isnan(s_high[i])) for i in range(np.shape(s_high)[0])]


In [10]:
np.save('paper_figures/real_world/genotypes_low_uncertainty.npy', np.array(s_high))
np.save('paper_figures/real_world/genotypes_high_uncertainty.npy', np.array(s))