In [None]:
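# [REDACTED] A GitHub personal access token was pasted in this cell. Revoke that token immediately and load credentials from an environment variable or secrets manager instead of storing them in the notebook.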

In [1]:
# once i have wholegenome_offset.trees
import pandas as pd
import tskit
import allel
import random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tsinfer
import pyslim
import os
from collections import defaultdict

In [None]:
## Create the initial Python file before the simulations, to set up the architectures and create the variances and optima

In [None]:
#### params

In [None]:
# Input files handed over by the Snakemake rule.
og_vcf_offset = snakemake.input['og_vcf_offset']
og_tree_offset = snakemake.input['og_tree_offset']

# The 'pi', 'beta' and 'allele_freq' params each hold the *name* of another
# params entry; the value actually used is looked up under that name.
pi_option = snakemake.params['pi']
beta_option = snakemake.params['beta']
alelle_freq_option = snakemake.params['allele_freq']

pi = int(snakemake.params[pi_option])
# NOTE(review): int() drops any fractional part of beta - confirm beta is
# always meant to be a whole number.
beta = int(snakemake.params[beta_option])

allele_freq = snakemake.params[alelle_freq_option]
lower_bound, upper_bound = float(allele_freq[0]), float(allele_freq[1])


# Number of optima, kept as a string here (calc_optima converts it with int()).
optima_qty = str(snakemake.params['optima_qty'])

# Output files declared by the Snakemake rule.
output_tree_seq_causalloci = snakemake.output["tree_seq_causalloci"]
output_loci_effectsize = snakemake.output["loci_effectsize"]
output_phenotypes = snakemake.output["phenotypes"]
output_optima_values = snakemake.output["optima_values"]
output_variance_values = snakemake.output["variance_values"]

In [87]:
## Hard-coded parameters for interactive runs. These reuse the names set by
## the Snakemake parameter cell, so running this cell replaces those values.
og_vcf_offset = '../treeseq/wholegenome_offset.vcf'
og_tree_offset = '../treeseq/wholegenome_offset_baselinetree.trees'

pi, beta = 20, 1

allele_freq = [0.2, 0.3]
lower_bound, upper_bound = (float(bound) for bound in allele_freq)


# Number of optima to generate.
optima_qty = 5


In [88]:
## functions 

In [89]:
def calc_pos_sc(alt_al_per_pos, pos, n_ecotypes, allele_freq, pi, beta):
    """Sample ``pi`` causal sites inside an allele-frequency window and draw their effect sizes.

    Parameters
    ----------
    alt_al_per_pos : 2-D array-like
        Alternate-allele counts; summed along axis 1 to get one count per site.
    pos : array-like
        Position of each site, aligned with the rows of ``alt_al_per_pos``.
    n_ecotypes : int
        Number of individuals; frequencies are ``count / (2 * n_ecotypes)``.
    allele_freq : sequence of two numbers
        ``[lower, upper]``; a site is a candidate when ``lower <= freq < upper``.
    pi : int
        Number of sites to sample (without replacement).
    beta : float
        Standard deviation of the zero-centred normal the effect sizes are drawn from.

    Returns
    -------
    pandas.DataFrame
        Columns ``'pos'`` (sampled positions, in sampling order) and ``'sc'``
        (the effect size drawn for each position).

    Raises
    ------
    ValueError
        If fewer than ``pi`` sites fall inside the frequency window.

    Notes
    -----
    No random seed is set inside this function.
    """
    # Use the bounds passed in by the caller; the previous version silently
    # read module-level globals and ignored the ``allele_freq`` argument.
    lower, upper = float(allele_freq[0]), float(allele_freq[1])

    alt_al_count = alt_al_per_pos.sum(axis=1)
    alelle_dist = pd.DataFrame({'alt_al_count': alt_al_count, 'pos': pos})
    alelle_dist['alt_al_freq'] = alelle_dist['alt_al_count'] / (n_ecotypes * 2)

    in_window = (alelle_dist['alt_al_freq'] >= lower) & (alelle_dist['alt_al_freq'] < upper)
    candidate_pos = alelle_dist.loc[in_window, 'pos']
    if len(candidate_pos) < pi:
        raise ValueError(
            f"Only {len(candidate_pos)} sites have an alternate-allele frequency in "
            f"[{lower}, {upper}); cannot sample pi={pi} causal sites."
        )

    selected_sites = candidate_pos.sample(pi).values
    sc = np.random.normal(0, beta, pi)
    return pd.DataFrame({'pos': selected_sites, 'sc': sc})

In [90]:
def calc_phenotypes(pos,pos_sc, alt_al_per_pos):
    """Compute one additive phenotype per sample from the causal loci.

    Parameters
    ----------
    pos : array-like
        Position of each site, aligned with the rows of ``alt_al_per_pos``.
    pos_sc : pandas.DataFrame
        Columns ``'pos'`` (causal positions) and ``'sc'`` (effect size of each).
        The rows may be in any order.
    alt_al_per_pos : 2-D array-like
        Alternate-allele counts, one row per site and one column per sample.

    Returns
    -------
    list of float
        For every column of ``alt_al_per_pos``, the sum over causal sites of
        ``allele count * effect size``.
    """
    # Look effect sizes up by position. The previous version multiplied the
    # genotype rows (in ``pos`` order) by ``pos_sc['sc']`` (in sampling order),
    # which paired effect sizes with the wrong loci.
    effect_by_pos = dict(zip(pos_sc['pos'], pos_sc['sc']))
    per_site_effect = pd.Series(pos).map(effect_by_pos)  # NaN for non-causal sites
    is_causal = per_site_effect.notna().to_numpy()

    causal_genotypes = np.asarray(alt_al_per_pos)[is_causal]
    causal_effects = per_site_effect[is_causal].to_numpy(dtype=float)

    # (n_causal,) @ (n_causal, n_samples) -> one phenotype per sample.
    return (causal_effects @ causal_genotypes).tolist()

In [91]:
def calc_optima(phenotypes, n_optima=None):
    """Return evenly spaced optima spanning the observed phenotype range.

    Parameters
    ----------
    phenotypes : sequence of numbers
        Observed phenotypes; their minimum and maximum are the first and last optimum.
    n_optima : int or str, optional
        Number of optima. When omitted, the module-level ``optima_qty`` is
        used, which is what the original one-argument call relied on.

    Returns
    -------
    list
        ``n_optima`` values from the minimum to the maximum phenotype, each
        rounded to 4 decimals. An empty list when ``n_optima`` is zero or negative.

    Raises
    ------
    ValueError
        If ``n_optima`` is 1: a single optimum has no spacing (this used to
        surface as a ZeroDivisionError).
    """
    if n_optima is None:
        n_optima = optima_qty  # notebook-level setting
    n_optima = int(n_optima)
    if n_optima == 1:
        raise ValueError("calc_optima needs at least 2 optima to span the phenotype range")

    min_pheno, max_pheno = min(phenotypes), max(phenotypes)
    step = (max_pheno - min_pheno) / (n_optima - 1)
    return [round(min_pheno + i * step, 4) for i in range(n_optima)]

In [92]:
def calc_variances(phenotypes, optima):
    """Return three selection widths (strong, moderate, weak) scaled to the optimum spacing.

    Parameters
    ----------
    phenotypes : sequence of numbers
        Observed phenotypes; only their range (max - min) is used.
    optima : sequence
        The optima; only its length is used.

    Returns
    -------
    list of float
        ``[sd1, sd2, sd3]``. Despite the function name these are standard
        deviations, exactly as the original returned.
        NOTE(review): confirm the downstream consumer expects standard
        deviations and not variances (sd ** 2).
    """
    range_pheno = max(phenotypes) - min(phenotypes)
    # calc_optima places n optima across the range, so adjacent optima are
    # range / (n - 1) apart. The previous ``range / n`` made every width too
    # small for the "3 sd reaches ..." reasoning below.
    n_intervals = max(len(optima) - 1, 1)
    dist_between_env = range_pheno / n_intervals

    ## Strong selection: 3 sd reaches the adjacent environment
    sd1 = dist_between_env / 3
    ## Moderate selection: 3 sd spans 4 environment spacings
    sd2 = (dist_between_env * 4) / 3
    ## Weak selection: 3 sd spans 8 environment spacings
    sd3 = (dist_between_env * 8) / 3
    return [sd1, sd2, sd3]

def keep_only_causal_sites_and_mutations(og_tree_offset, pos_sc):
    """Load a tree sequence and keep only the causal sites, annotated for SLiM.

    Parameters
    ----------
    og_tree_offset : str
        Path of the tree sequence file to load.
    pos_sc : pandas.DataFrame
        Columns ``'pos'`` (causal site positions) and ``'sc'`` (selection
        coefficient of each position). The rows may be in any order.

    Returns
    -------
    tskit.TreeSequence
        A tree sequence whose sites are only those located at a position in
        ``pos_sc['pos']``. Every kept site has an empty ancestral state, and
        every kept mutation has its running index as derived state and SLiM
        mutation metadata carrying the selection coefficient of its site.
    """
    ts = tskit.load(og_tree_offset)

    # Drop every site that is not a causal locus. delete_sites removes the
    # mutations of those sites too and renumbers site ids, so nothing here
    # depends on there being exactly one mutation per site (the previous
    # version applied a per-site mask directly to the mutation table).
    is_causal = np.isin(ts.tables.sites.position, pos_sc['pos'].to_numpy())
    new_ts = ts.delete_sites(np.flatnonzero(~is_causal), record_provenance=False)

    # Selection coefficient keyed by position, so the assignment below cannot
    # be shifted by row order or by several mutations sharing a site.
    sc_by_position = {float(p): float(s) for p, s in zip(pos_sc['pos'], pos_sc['sc'])}

    tables = new_ts.dump_tables()
    kept_positions = tables.sites.position.copy()

    ## change the ancestral state to empty or SLiM will complain
    tables.sites.clear()
    for s in new_ts.sites():
        tables.sites.append(s.replace(ancestral_state=""))

    ## add the selection coefficient and the right metadata for SLiM
    tables.mutations.clear()
    for k, m in enumerate(new_ts.mutations()):
        mm = pyslim.default_slim_metadata('mutation_list_entry')
        mm['selection_coeff'] = sc_by_position[float(kept_positions[m.site])]
        tables.mutations.append(
            m.replace(derived_state=str(k), metadata={'mutation_list': [mm]}))

    return tables.tree_sequence()

In [93]:
## Read the original offset VCF so the sampled positions can be mapped onto the tree sequence.
vcf_og = allel.read_vcf(og_vcf_offset, fields=["calldata/GT", 'variants/POS' , 'samples'])
geno_og, samples, pos = (vcf_og[key] for key in ("calldata/GT", 'samples', 'variants/POS'))

n_ecotypes = len(samples)
# Collapse the last genotype axis into one allele-index sum per site and sample.
# NOTE(review): missing calls (if encoded as negative values) and multi-allelic
# indices would distort this sum - confirm the VCF has neither.
alt_al_per_pos = geno_og.sum(axis=2)

pos_sc = calc_pos_sc(alt_al_per_pos, pos, n_ecotypes, allele_freq, pi, beta)

In [133]:


# Phenotypes, optima and selection widths derived from the sampled causal loci.
phenotypes = calc_phenotypes(pos,pos_sc, alt_al_per_pos)

optima = calc_optima(phenotypes)

variances = calc_variances(phenotypes, optima)

# Lookup table of selection label -> width. It was previously written twice to
# the same file in two layouts; only the second write survived, so only that
# one is kept.
pd.DataFrame(data = {'selection': ['strongsel','moderatesel','lowsel'], 'var': variances}).to_csv('sel_var.csv')

## save

pd.Series(phenotypes).to_csv(output_phenotypes)
pos_sc.to_csv(output_loci_effectsize)

with open(output_optima_values, 'w') as file:
    for element in optima:
        file.write(str(element) + '\n')  # one optimum per line

with open(output_variance_values, 'w') as file:
    for element in variances:
        file.write(str(round(element,4)) + '\n')  # one value per line, 4 decimals

### filter tree

pre_slim_tree = keep_only_causal_sites_and_mutations(og_tree_offset, pos_sc)

## save tree

pre_slim_tree.dump(output_tree_seq_causalloci)

In [80]:
## Tree post-processing

In [85]:
# Inputs and outputs of the tree post-processing step, provided by Snakemake.
og_tree_offset = snakemake.input['og_tree_offset']
mapper_realid_metadataid = snakemake.input['mapper_ids']
output_sim_tree = snakemake.input['output_sim_tree']
# The stray trailing backslash that used to end the next line glued it to the
# following assignment and made the cell a SyntaxError.
output_sim_tree_wm = snakemake.output['output_sim_tree_wm']
output_vcf = snakemake.output['output_vcf']

def overlap_neutral_mut (ts_new, ts, mapper_realid_metadataid):
    """Copy the sites and mutations of ``ts`` onto the surviving nodes of ``ts_new``.

    Parameters
    ----------
    ts_new :
        Tree sequence whose node metadata carries a ``'slim_id'`` entry.
    ts :
        Tree sequence whose site and mutation tables are the source.
    mapper_realid_metadataid : pandas.DataFrame
        Must provide the columns ``'my_ids_metadata'`` and ``'real_id'``.

    Returns
    -------
    The simplified tree sequence built from ``ts_new``'s tables after its
    mutation and site tables were replaced by the filtered ones from ``ts``.

    NOTE(review): the mutation mask is also applied to the site table and site
    ids are renumbered 0..n-1, which presumably relies on ``ts`` having exactly
    one mutation per site, in the same order - confirm.
    """
    ## extract surviving nodes and compare them to our old nodes to place mutations in the right place
    surviving_nodes = []
    for i in ts_new.tables.nodes:
        surviving_nodes.append(i.metadata['slim_id'])
    ## new node ids and the ids I gave them in the past
    new_mapper = pd.DataFrame({'new_ids': range(0, len(ts_new.tables.nodes)), 'my_ids_metadata':surviving_nodes})
    ## map old nodes with new nodes (right merge: one row per mapper entry, new_ids is NaN when the node is gone)
    mapper_lost_nodes = new_mapper.merge(mapper_realid_metadataid, left_on = 'my_ids_metadata', right_on = 'my_ids_metadata', how= 'right')

    ## create a mask to only keep from the old nodes the ones that survived the simulation
    mask = mapper_lost_nodes['new_ids'].notna()

    tables_og = ts.dump_tables()

    ## now filter old tables only based on surviving nodes
    ## NOTE(review): assumes the mapper has one row per node of ``ts`` in node order - confirm.
    ## The filtered node table is not read again in this function.
    tables_og.nodes.replace_with(tables_og.nodes[mask])

    ## now filter mutation table based on the surviving nodes; for that, extract the nodes
    old_nodes = tables_og.mutations.node

    old_nodes = pd.Series(old_nodes)

    old_nodes.name = 'old_nodes'

    ## create a dataframe relating the new and old nodes
    replace_oldbynew_nodes = pd.merge(old_nodes, mapper_lost_nodes, left_on ='old_nodes', right_on = 'real_id', how= 'left')

    ## create a mask to filter out all the mutations that have been lost
    mask_mutations_lost = replace_oldbynew_nodes['new_ids'].notna()

    ## filter out mutations that have been lost
    table_mutations = tables_og.mutations[mask_mutations_lost]

    ## now replace the old node ids by the new node ids with the mapper
    ## NOTE(review): dropna() drops rows with a NaN in *any* column, while the mask above only
    ## checks 'new_ids'; the two lengths match only if no other column has missing values.
    ids_to_replace = replace_oldbynew_nodes.dropna()['new_ids']
    table_mutations.node = np.array(ids_to_replace.astype('int32'))

    ## and just set the sites from 0 to the length of the mutation table
    table_mutations.site = np.array(range(0, len(table_mutations))).astype('int32')

    ## apply the same filter from the mutations table to the sites table
    table_sites = tables_og.sites[mask_mutations_lost]  

    ## now put these filtered old tables into the new tree sequence
    new_tables = ts_new.dump_tables()

    new_tables.mutations.replace_with(table_mutations)

    new_tables.sites.replace_with(table_sites)

    ## make sure to compute mutation parents
    new_tables.compute_mutation_parents()

    ## create tree seq based on tables
    tree_nm = new_tables.tree_sequence()

    return tree_nm.simplify()

def convert_tree_to_vcf(tree, name_vcf):
    """Write ``tree`` in VCF format to the file path ``name_vcf``.

    The file is opened in text write mode (replacing any existing content) and
    passed to ``tree.write_vcf`` as its ``output`` argument.
    """
    with open(name_vcf, mode='w') as vcf_out:
        tree.write_vcf(output=vcf_out)

# Load the original tree sequence.
ts_old = tskit.load(og_tree_offset)
# Load the node-id mapping table; the path variable is rebound to the loaded
# DataFrame, so this cell cannot simply be re-run without resetting it.
mapper_realid_metadataid = pd.read_csv(mapper_realid_metadataid)

## Post-process the simulated tree. Nothing happens when the file is missing.
if os.path.exists(output_sim_tree):
    if os.path.getsize(output_sim_tree) <= 1:
        # The simulation left a placeholder: emit empty outputs.
        print('empty_tree')
        with open(output_vcf, "w"):
            pass  # Create an empty vcf file
        with open(output_sim_tree_wm, "w"):
            pass  # Create an empty tree file
    else:
        ts_new = tskit.load(output_sim_tree)
        ts_nm = overlap_neutral_mut(ts_new, ts_old, mapper_realid_metadataid)
        ts_nm.dump(output_sim_tree_wm)
        convert_tree_to_vcf(ts_nm, output_vcf)

In [None]:
    # NOTE(review): this cell is an indented copy of the body of
    # keep_only_causal_sites_and_mutations without its ``def`` line. It cannot
    # run as a cell on its own (top-level indentation, ``return`` outside a
    # function) - consider deleting it.
    ts = tskit.load(og_tree_offset)

    # dump the tables from the tree
    tables = ts.dump_tables()

    ## extract all the sites from the og tree
    complete_sites = pd.Series(tables.sites.position)

    # create a mask to filter only the ones present in the selected sites (causal loci)
    mask_delete_sites = complete_sites.isin(pos_sc['pos'])

    ## replace the table only with the causal sites, and same for mutation tables
    tables.sites.replace_with(tables.sites[mask_delete_sites])
    tables.mutations.replace_with(tables.mutations[mask_delete_sites])
    ## extract the new site index
    tables.mutations.site = np.array(range(0, len(tables.mutations))).astype('int32')

    ## get the positions and sc in the right order
    pos_table = pd.Series(tables.sites.position).reset_index()
    right_order_pos = pos_sc.merge(pos_table, left_on='pos',right_on =0).sort_values('index')

    ## create the tree to then modify it 
    new_ts = tables.tree_sequence()
    tables = new_ts.dump_tables()

    ## change the ancestral state to empty or SLiM will complain
    tables.sites.clear()
    for s in new_ts.sites():
        tables.sites.append(s.replace(ancestral_state=""))

    ## add the selection coefficient and the right metadata for SLiM
    tables.mutations.clear()
    for k, (m, sc) in enumerate(zip(new_ts.mutations(), right_order_pos['sc'])):
        mm = pyslim.default_slim_metadata('mutation_list_entry')
        mm['selection_coeff'] = sc
        tables.mutations.append(
            m.replace(derived_state=str(k), metadata={'mutation_list': [mm]}))
        
    return tables.tree_sequence()

In [12]:
# Project root, always ending with a path separator because later cells build
# file names with ``path + 'data/...'``. Set the SLIM_GRENENET_DIR environment
# variable to run on another machine; the default is the original location.
path = os.path.join(os.environ.get('SLIM_GRENENET_DIR', '/home/tbellagio/scratch/slim_grenenet/'), '')

In [10]:
import pyslim
import tskit

# Load the baseline tree sequence and take an editable copy of its tables.
baseline_tree = tskit.load('og_tree_offset.trees')
tables = baseline_tree.dump_tables()

# Rebuild the site table with every ancestral state blanked out.
tables.sites.clear()
for site in baseline_tree.sites():
    blank_site = site.replace(ancestral_state="")
    tables.sites.append(blank_site)

# Rebuild the mutation table: each mutation gets its running index as derived
# state and the same default SLiM mutation metadata entry.
tables.mutations.clear()
mm = pyslim.default_slim_metadata('mutation_list_entry')
for index, mutation in enumerate(baseline_tree.mutations()):
    slim_mutation = mutation.replace(derived_state=str(index), metadata={'mutation_list': [mm]})
    tables.mutations.append(slim_mutation)

og_tree_offset_for_slim = tables.tree_sequence()
og_tree_offset_for_slim.dump('og_tree_offset_for_slim.trees')

In [16]:
# Load the SLiM-ready tree sequence from the project's data directory under ``path``.
og_tree = tskit.load(path + 'data/og_tree_offset_for_slim.trees')

In [18]:
og_tree

Tree Sequence,Unnamed: 1
Trees,1674685
Sequence Length,119146348.0
Time Units,uncalibrated
Sample Nodes,462
Total Size,758.0 MiB
Metadata,dict  SLiM:  dict  cycle: 1 description: file_version: 0.8 model_type: nonWF name: nucleotide_based: False separate_sexes: False spatial_dimensionality: spatial_periodicity: stage: early tick: 1

Table,Rows,Size,Has Metadata
Edges,8960512,273.5 MiB,
Individuals,231,22.5 KiB,✅
Migrations,0,8 Bytes,
Mutations,3235480,184.1 MiB,✅
Nodes,2146164,77.8 MiB,✅
Populations,1,2.5 KiB,✅
Provenances,1,587 Bytes,
Sites,3235480,154.3 MiB,✅


In [17]:
# Display the summary of a simplified copy; the result is not stored, so og_tree itself is unchanged.
og_tree.simplify()

Tree Sequence,Unnamed: 1
Trees,1622022
Sequence Length,119146348.0
Time Units,uncalibrated
Sample Nodes,462
Total Size,862.3 MiB
Metadata,dict  SLiM:  dict  cycle: 1 description: file_version: 0.8 model_type: nonWF name: nucleotide_based: False separate_sexes: False spatial_dimensionality: spatial_periodicity: stage: early tick: 1

Table,Rows,Size,Has Metadata
Edges,11929901,364.1 MiB,
Individuals,231,22.5 KiB,✅
Migrations,0,8 Bytes,
Mutations,3235480,184.1 MiB,✅
Nodes,1899499,68.8 MiB,✅
Populations,1,2.5 KiB,✅
Provenances,2,1.1 KiB,
Sites,3235480,154.3 MiB,✅


In [13]:
# Load the "after cloning" tree sequence from the project's data directory under ``path``.
og_tree_offset_after_cloning = tskit.load(path + 'data/og_tree_offset_after_cloning.trees')

In [19]:
# Replace the loaded tree with its simplified version. The name is rebound in
# place, so re-running this cell simplifies the already simplified tree.
og_tree_offset_after_cloning = og_tree_offset_after_cloning.simplify()

In [20]:
def convert_tree_to_vcf(tree, name_vcf):
    """Dump ``tree`` as VCF text into the file at ``name_vcf``.

    Opens the path for writing (overwriting existing content) and hands the
    open file to ``tree.write_vcf`` through its ``output`` keyword.
    """
    with open(name_vcf, mode='w') as vcf_out:
        tree.write_vcf(output=vcf_out)

In [21]:
# Export the (simplified) after-cloning tree sequence as a VCF in the project's data directory.
convert_tree_to_vcf(og_tree_offset_after_cloning, path + 'data/wholegenome_offset_after_cloning.vcf')

In [None]:
# RUN THE CORRECTION FOR POSITIONS

In [None]:
og_tree_offset_after_cloning

In [15]:
# Display the summary of a simplified copy; the result is not stored.
og_tree_offset_after_cloning.simplify()

Tree Sequence,Unnamed: 1
Trees,1622022
Sequence Length,119146348.0
Time Units,ticks
Sample Nodes,5082
Total Size,862.9 MiB
Metadata,dict  SLiM:  dict  cycle: 1 file_version: 0.8 model_type: nonWF name: sim nucleotide_based: False separate_sexes: False spatial_dimensionality: spatial_periodicity: stage: late tick: 1

Table,Rows,Size,Has Metadata
Edges,11934521,364.2 MiB,
Individuals,2541,248.1 KiB,✅
Migrations,0,8 Bytes,
Mutations,3235480,184.1 MiB,✅
Nodes,1904119,69.0 MiB,✅
Populations,1,2.3 KiB,✅
Provenances,3,4.4 KiB,
Sites,3235480,154.3 MiB,✅
