In [1]:
import dendropy
import pandas as pd
from sys import argv
import argparse
from random import randint

In [5]:
def pull_data_taxon(fossil_df, **kwargs):
    '''Pull the oldest fossil in a group. Mandatory: what level (i.e., subfamily, tribe, etc).'''
    foss_list = []
    for key, value in kwargs.items():
        try:
          kwargs["level"]
        except KeyError:
          raise KeyError('level is required is a Required Argument that tells the program from which \\
                         taxonomic group to sample. Options include subfamily, tribe, genus')  
        try:
          kwargs["age"]
        except KeyError:
          raise KeyError('age is required is a Required Argument that specifies how to sample \\
                          within a taxonomic group. Options include oldest, youngest, random')          
        if key == "level":
            group_key = value.lower()
        if key == "age":
            age_key = value.lower()
            if age_key == "oldest":
                if "fraction" in kwargs.keys():
                    num_key = kwargs["fraction"]
                    oldest_df = fossil_df.groupby(group_key).apply(lambda x: x.nlargest(int(len(x) * num_key), 'max_ma'))[["taxon","max_ma", group_key]]
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)
                else:
                    oldest_df = fossil_df.groupby([group_key]).max()[["max_ma", "taxon"]]
                    oldest_df = oldest_df.reset_index()
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)
            elif age_key == "youngest":
                if "fraction" in kwargs.keys():
                    num_key = kwargs["fraction"]
                    oldest_df = fossil_df.groupby(group_key).apply(lambda x: x.nsmallest(int(len(x) * num_key), 'max_ma'))[["taxon","max_ma", group_key]]
                    #                    oldest_df = oldest_df.reset_index()
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)
                else:
                    oldest_df = fossil_df.groupby([group_key]).min()[["max_ma", "taxon"]]
                    oldest_df = oldest_df.reset_index()
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)
            elif age_key == "random":                    
                if "fraction" in kwargs.keys():
                    num_key = kwargs["fraction"]
                    oldest_df = fossil_df.groupby(group_key).apply(lambda x: x.sample(frac=num_key))[["taxon","max_ma", group_key]]
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)                    
                else:
                    oldest_df = fossil_df.groupby(group_key).apply(lambda x: x.sample(1))[["max_ma", "taxon"]]
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)

In [6]:
pull_data_taxon(fossil_df, level="genus", age="oldest", fraction=.5)

In [7]:
def pull_data_sampling(fossil_df, **kwargs):
    '''Pull fossils relative to time. Mandatory keyword: Strategy. Options: uniform (freq=int), time_dep, time
    stratified. If time-stratified, must also provide a list of time bins (with time_bins = list or df)'''
    oldest = float(fossil_df[["max_ma"]].max())
    assert len(kwargs.items()) > 0, "No required args provided. Must provide sampling strategy. \
                                     Options: uniform, diversified, time stratified."
    for key, value in kwargs.items():
        try:
          kwargs["strategy"]
        except KeyError:
          raise KeyError('strategy is a Required Argument that tells the program how to sample fossils through time. \\
                          Options: uniform, time_dep, time stratified. If time-stratified, provide a list of bins.')  
        if "time-stratified" in kwargs.values():
            try:
                kwargs["time_bins"]
            except KeyError:
                raise KeyError('For time-binned sampling, time bins must be specified withe the time_bins kwarg. \
                                Input may be a list of lists specifying sampling, or a dataframe of time bins') 
        if key == "strategy":
            type_key = value.lower()
        if type_key == "uniform":
            if "freq" in kwargs.keys():
                samp_freq = kwargs["freq"]
            else:
                samp_freq = .1
                print("Uniform sampling indicated, but no sliding window width. Will assume window is 10% of  \\
                      age of oldest fossil")
        if type_key == "time-dep":
            if "multiplier" in kwargs.keys():
                multi = kwargs["multiplier"]
            else:
                print("Time dependent sampling indicated, but no multiplier. Will assume sampling frequency \\
                      increases 10% each time bin towards the present")
                samp_freq = 1.1
            if "freq" in kwargs.keys():
                samp_freq = kwargs["freq"]
            else:
                samp_freq = .1
                print("Time dependent sampling indicated, but no sliding window width. Will assume window is 10% of  \\
                      age of oldest fossil")
    bin = float(oldest)*samp_freq
    num_bins = round(oldest/bin)
    bottom_interval = oldest - bin
    l = pd.DataFrame()
    
    if type_key == "uniform":

        for x in range(0,num_bins):
            bottom_interval = oldest - (bin*(x+1))
            top_interval = oldest - (bin*x)
            tmp_df = fossil_df[(fossil_df['max_ma'] >= bottom_interval) & (fossil_df['max_ma'] <= top_interval)]
            if len(tmp_df) > 1:
                if "number" in kwargs:
                    numb = kwargs["number"]
                    if len(tmp_df) >= numb:
                        l = l.append(tmp_df.sample(numb))
                    else:
                        l = l.append(tmp_df.sample(len(tmp_df)))
                else: 
                    l = l.append(tmp_df.sample(1)) 
            else:
                pass
            
    if type_key == "time-dep":
        for x in range(0,num_bins):

            bottom_interval = oldest - (bin*(x+1))
            top_interval = oldest - (bin*x)
            tmp_df = fossil_df[(fossil_df['max_ma'] >= bottom_interval) & (fossil_df['max_ma'] <= top_interval)]
            if len(tmp_df) > 1:
                if multi > 1:
                    multi = 1
                numb = round(len(tmp_df)*multi)
                l = l.append(tmp_df.sample(numb))
            else:
                pass      
            multi = multi * (1 + multi)
            
    l = l.drop(['notes', 'reference_no', 'tribe', 'min_ma', 'fossil'], axis=1)
    l[["taxon","age", "subfamily", "genus"]] = l[["taxon","max_ma", "subfamily", "genus"]]
    l.to_csv("../Data/fossil_tax_unit.tsv", index=False)
    l = l.drop(['subfamily', 'genus','max_ma'], axis=1)
    l.to_csv("../Data/fossil_sample.tsv", index=False)

In [8]:
pull_data_sampling(fossil_df, strategy="uniform", multiplier = .05, freq = .1)

In [19]:
def make_combined_data(fossil_df, phylo_dat, fossils, extant_df, mol):
    names_taxon = pd.DataFrame()
    tree_names = phylo_dat.taxon_namespace
    print(tree_names)
    for name in tree_names.labels():
        if name in fossil_df.taxon.values: 
            names_taxon = names_taxon.append(fossil_df[fossil_df['taxon'].str.contains(name)])
        elif name not in fossil_df.taxon.values:
            names_taxon = names_taxon.append(extant_df[extant_df['taxon'].str.contains(name)])
        elif name not in extant_df.taxon.values:
            names_taxon = names_taxon.append(mol[mol['taxon'].str.contains(name)])            
        else:
            names_taxon = names_taxon.append(name)
            print("{} is not contained in any morphology file, added without data".format(name))  
    names_taxon = names_taxon[["taxon", "genus", "subfamily"]]
    print(names_taxon)
    return(names_taxon)

In [20]:
names_df = make_combined_data(fossil_df, phylo_dat, fossils, extant_df, mol)

['Acanthoponera_minor', 'Amblyopone_pallipes', 'Aneuretus', 'Anomalomyrma_sp', 'Apomyrma_stygia', 'Brownimecia_clavata', 'Camelomecia', 'Cerapachys_sexspinus', 'Chalybion_californicum', 'Chyphotes_mellipes', 'Formica_fusca', 'Gerontoformica_gracilis', 'Gerontoformica_magnus', 'Gerontoformica_pilosus', 'Gerontoformica_spiralis', 'Haidomyrmex_scimitarus', 'Haidomyrmodes_mammuthus', 'Haidoterminus_cippus', 'Heterogyna', 'Hypoponera_opacior', 'Kyromyrma', 'Lasius_californicus', 'Leptanilla_swani', 'Leptanilloides_nomada', 'Leptogenys_diminuta', 'Martialis_heureka', 'Metapolybia_cingulata', 'Myanmyrma_gracilis', 'Myrmecia_nigriceps', 'Myrmica_americana', 'Nothomyrmecia_macrops', 'Opamyrma_hungvuong', 'Paraponera_clavata', 'Platythyrea_punctata', 'Pogonomyrmex_californicus', 'Proceratium_stictum', 'Scolia_verticalis', 'Sphecomyrma_freyi', 'Tatuidris_tatusia', 'Tetraponera_punctulata', 'Zigrasimecia', 'Adetomyrma_sp.', 'Amblyopone_armigera', 'Amblyopone_australis', 'Amblyopone_mercovichi', 'A

In [20]:
def make_taxon_set(names_df, **kwargs):
    n_l = []
    nl_dict = {}
    lines = []
    new_file_name = "model_FBDP_0.Rev"
    assert len(kwargs.items()) > 0, "No required args provided. Must provide taxonomic level to construct taxon sets"
    for key, value in kwargs.items():
        try:
          kwargs["level"]
        except KeyError:
          raise KeyError('level is a Required Argument that tells the program how to construct clade contraints')  
    if key == "level":
        group_key = str(value.lower()) 
        n_l = names_df[group_key].unique()
        results = names_df.groupby(group_key)['taxon']
    for name in n_l:
        nl_dict[name] = results.get_group(name).tolist()
    for key, value in nl_dict.items():
        for item in value:
            item = item.strip()
        sentence = key + " = clade(\"" + "\",\"".join(value) + "\")" + "\n"
        lines.append(sentence)
    tax_list = "constraints=v(" + ",".join(str(x) for x in nl_dict.keys()) + ")"
    try:
        outfile = open(new_file_name,'r+')
    except:
# if file does not exist, create it
        outfile = open(new_file_name,'w')
#    with open(new_file_name) as outfile:
    with open("model_FBDP.Rev", "r") as infile:
        for line in infile: 
            if "INSERT1" in line:
                outfile.write(line.replace("INSERT1", "".join(str(x) for x in lines)))
            elif "INSERT2" in line:
                outfile.write(line.replace("INSERT2", tax_list))
            else:
                outfile.write(line)
    outfile.close()

In [21]:
make_taxon_set(names_df, level = "subfamily")

In [18]:
if __name__ == "__main__":
	parser = argparse.ArgumentParser()

	parser.add_argument("--fossil", help="Path to the csv of fossils.")
	parser.add_argument("--ages", help="Path to data ages TSV or CSV file containing \
	ages of non-contemporaneous tips, if any exist in your analysis.")
	parser.add_argument("--sample", help="How to sample fossils. Options: oldest, proportional, sampling.")
	parser.add_argument("--output", help="Path to where you'd like to write output")
	args = parser.parse_args()
	if args.set:
		df = args.set
	if args.ages:
		tnrs = args.ages
	if args.output:
		outfile = args.output		
		
	foss_tax = parse_dataframe(df)

usage: ipykernel_launcher.py [-h] [--fossil FOSSIL] [--ages AGES]
                             [--sample SAMPLE] [--output OUTPUT]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1000/jupyter/kernel-850356df-1421-4ba5-a59d-a982537519d8.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [14]:
fossil_df = pd.read_csv("../Data/FossilTNRS.csv")
fossils = pd.read_csv("../Data/fossil_tax_unit.tsv")
mol = pd.read_csv("../test.csv")

phylo_dat = dendropy.StandardCharacterMatrix.get_from_path("../Data/AntMegaMatrix.nex", schema="nexus", preserve_underscores=True)
extant_df = pd.read_csv("../Data/morphTNRS.csv")

In [11]:
names_df

Unnamed: 0,taxon,genus,subfamily
121,Acanthoponera_minor,Acanthoponera,Heteroponerinae
73,Aneuretus,Aneuretus,Aneuretinae
160,Aneuretus_simoni,Aneuretus,Aneuretinae
113,Apomyrma_stygia,Apomyrma,Apomyrminae
39,Brownimecia_clavata,Brownimecia,Brownimeciinae
163,Camelomecia,Camelomecia,
267,Formica_fusca,Formica,Formicinae
758,Gerontoformica_gracilis,Gerontoformica,Sphecomyrinae
759,Gerontoformica_magnus,Gerontoformica,Sphecomyrinae
763,Gerontoformica_pilosus,Gerontoformica,Sphecomyrinae
