In [3]:
import dendropy
import pandas as pd
import argparse
from random import randint

In [326]:
def pull_data_taxon(fossil_df, **kwargs):
    '''Pull the oldest fossil in a group. Mandatory: what level (i.e., subfamily, tribe, etc).'''
    foss_list = []
    for key, value in kwargs.items():
        try:
          kwargs["level"]
        except KeyError:
          raise KeyError('level is required is a Required Argument that tells the program from which \\
                         taxonomic group to sample. Options include subfamily, tribe, genus')  
        try:
          kwargs["age"]
        except KeyError:
          raise KeyError('age is required is a Required Argument that specifies how to sample \\
                          within a taxonomic group. Options include oldest, youngest, random')          
        if key == "level":
            group_key = value.lower()
        if key == "age":
            age_key = value.lower()
            if age_key == "oldest":
                if "fraction" in kwargs.keys():
                    num_key = kwargs["fraction"]
                    oldest_df = fossil_df.groupby(group_key).apply(lambda x: x.nlargest(int(len(x) * num_key), 'max_ma'))[["taxon","max_ma", group_key]]
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)
                else:
                    oldest_df = fossil_df.groupby([group_key]).max()[["max_ma", "taxon"]]
                    oldest_df = oldest_df.reset_index()
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)
            elif age_key == "youngest":
                if "fraction" in kwargs.keys():
                    num_key = kwargs["fraction"]
                    oldest_df = fossil_df.groupby(group_key).apply(lambda x: x.nsmallest(int(len(x) * num_key), 'max_ma'))[["taxon","max_ma", group_key]]
                    #                    oldest_df = oldest_df.reset_index()
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)
                else:
                    oldest_df = fossil_df.groupby([group_key]).min()[["max_ma", "taxon"]]
                    oldest_df = oldest_df.reset_index()
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)
            elif age_key == "random":                    
                if "fraction" in kwargs.keys():
                    num_key = kwargs["fraction"]
                    oldest_df = fossil_df.groupby(group_key).apply(lambda x: x.sample(frac=num_key))[["taxon","max_ma", group_key]]
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df = oldest_df.drop(group_key, axis=1) 
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)                    
                else:
                    oldest_df = fossil_df.groupby(group_key).apply(lambda x: x.sample(1))[["max_ma", "taxon"]]
                    oldest_df.to_csv("../Data/fossil_tax_unit.tsv", sep="\t", index=False)
                    oldest_df[["taxon","age"]] = oldest_df[["taxon","max_ma"]]
                    oldest_df = oldest_df.drop("max_ma", axis=1) 
                    oldest_df.to_csv("../Data/fossil_sample.tsv", sep="\t", index=False)

In [327]:
pull_data_taxon(fossil_df, level="genus", age="oldest", fraction=.5)

In [63]:
def pull_data_sampling(fossil_df, **kwargs):
    '''Pull fossils relative to time. Mandatory keyword: Strategy. Options: uniform (freq=int), time_dep, time
    stratified. If time-stratified, must also provide a list of time bins (with time_bins = list or df)'''
    oldest = float(fossil_df[["max_ma"]].max())
    assert len(kwargs.items()) > 0, "No required args provided. Must provide sampling strategy. \
                                     Options: uniform, diversified, time stratified."
    for key, value in kwargs.items():
        try:
          kwargs["strategy"]
        except KeyError:
          raise KeyError('strategy is a Required Argument that tells the program how to sample fossils through time. \\
                          Options: uniform, time_dep, time stratified. If time-stratified, provide a list of bins.')  
        if "time-stratified" in kwargs.values():
            try:
                kwargs["time_bins"]
            except KeyError:
                raise KeyError('For time-binned sampling, time bins must be specified withe the time_bins kwarg. \
                                Input may be a list of lists specifying sampling, or a dataframe of time bins') 
        if key == "strategy":
            type_key = value.lower()
        if type_key == "uniform":
            if "freq" in kwargs.keys():
                samp_freq = kwargs["freq"]
            else:
                samp_freq = .1
                print("Uniform sampling indicated, but no sliding window width. Will assume window is 10% of  \\
                      age of oldest fossil")
        if type_key == "time-dep":
            if "multiplier" in kwargs.keys():
                multi = kwargs["multiplier"]
            else:
                print("Time dependent sampling indicated, but no multiplier. Will assume sampling frequency \\
                      increases 10% each time bin towards the present")
                samp_freq = 1.1
            if "freq" in kwargs.keys():
                samp_freq = kwargs["freq"]
            else:
                samp_freq = .1
                print("Time dependent sampling indicated, but no sliding window width. Will assume window is 10% of  \\
                      age of oldest fossil")
    bin = float(oldest)*samp_freq
    num_bins = round(oldest/bin)
    bottom_interval = oldest - bin
    l = pd.DataFrame()
    
    if type_key == "uniform":

        for x in range(0,num_bins):
            bottom_interval = oldest - (bin*(x+1))
            top_interval = oldest - (bin*x)
            tmp_df = fossil_df[(fossil_df['max_ma'] >= bottom_interval) & (fossil_df['max_ma'] <= top_interval)]
            if len(tmp_df) > 1:
                if "number" in kwargs:
                    numb = kwargs["number"]
                    if len(tmp_df) >= numb:
                        l = l.append(tmp_df.sample(numb))
                    else:
                        l = l.append(tmp_df.sample(len(tmp_df)))
                else: 
                    l = l.append(tmp_df.sample(1)) 
            else:
                pass
            
    if type_key == "time-dep":
        for x in range(0,num_bins):

            bottom_interval = oldest - (bin*(x+1))
            top_interval = oldest - (bin*x)
            tmp_df = fossil_df[(fossil_df['max_ma'] >= bottom_interval) & (fossil_df['max_ma'] <= top_interval)]
            if len(tmp_df) > 1:
                if multi > 1:
                    multi = 1
                numb = round(len(tmp_df)*multi)
                l = l.append(tmp_df.sample(numb))
            else:
                pass      
            multi = multi * (1 + multi)
            
    l = l.drop(['notes', 'reference_no', 'tribe', 'min_ma', 'fossil'], axis=1)
    l[["taxon","age", "subfamily", "genus"]] = l[["taxon","max_ma", "subfamily", "genus"]]
    l.to_csv("../Data/fossil_tax_unit.tsv", index=False)
    l = l.drop(['subfamily', 'genus','max_ma'], axis=1)
    l.to_csv("../Data/fossil_sample.tsv", index=False)

In [64]:
pull_data_sampling(fossil_df, strategy="time-dep", multiplier = .05, freq = .1)

In [211]:
def make_combined_data(fossil_df, phylo_dat, fossils, extant_df):
    names_taxon = pd.DataFrame()
    tree_names = phylo_dat.taxon_namespace
    for name in tree_names.labels():
        if name in fossil_df.taxon.values: 
            names_taxon = names_taxon.append(fossil_df[fossil_df['taxon'].str.contains(name)])
        elif name not in fossil_df.taxon.values:
            names_taxon = names_taxon.append(extant_df[extant_df['taxon'].str.contains(name)])
        else:
            names_taxon = names_taxon.append(name)
            print("{} is not contained in any morphology file, added without data".format(name))  
    names_taxon = names_taxon[["taxon", "genus", "subfamily"]]
    return(names_taxon)

In [212]:
names_df = make_combined_data(fossil_df, phylo_dat, fossils, extant_df)

In [354]:
def make_taxon_set(names_df, **kwargs):
    n_l = []
    nl_dict = {}
    assert len(kwargs.items()) > 0, "No required args provided. Must provide taxonomic level to construct taxon sets"
    for key, value in kwargs.items():
        try:
          kwargs["level"]
        except KeyError:
          raise KeyError('level is a Required Argument that tells the program how to construct clade contraints')  
    if key == "level":
        group_key = str(value.lower()) 
        n_l = names_df[group_key].unique()
        results = names_df.groupby(group_key)['taxon']
    for name in n_l:
#        print(results.get_group(name).tolist())
        nl_dict[name] = results.get_group(name).tolist()
    for key, value in nl_dict.items():
        print(key, "= clade(",'','","'.join(value),"\")")

In [355]:
make_taxon_set(names_df, level = "subfamily")

Heteroponerinae = clade(  Acanthoponera_minor","Heteroponera_relicta ")
Aneuretinae = clade(  Aneuretus","Aneuretus_simoni","Aneuretus_simoni ")
Apomyrminae = clade(  Apomyrma_stygia ")
Brownimeciinae = clade(  Brownimecia_clavata ")
Incertae = clade(  Camelomecia","Camelomecia_janovitzi","Myanmyrma_gracilis ")
Formicinae = clade(  Formica_fusca","Kyromyrma","Lasius_californicus","Gesomyrmex_luzonensis","Oecophylla_smaragdina ")
Sphecomyrinae = clade(  Gerontoformica_gracilis","Gerontoformica_magnus","Gerontoformica_pilosus","Gerontoformica_spiralis","Haidomyrmex_scimitarus","Haidomyrmodes_mammuthus","Haidoterminus_cippus ")
Ponerinae = clade(  Hypoponera_opacior","Leptogenys_diminuta","Platythyrea_punctata","Platythyrea_turneri","Anochetus_emarginatus","Odontomachus_bauri","Asphinctopone_silvestrii","Belonopelta_deletrix","Centromyrmex_brachycola","Cryptopone_gilva","Diacamma_ceylonense","Dinoponera_lucida","Dolioponera_fustigera","Emeryopone_buttelreepeni","Harpegnathos_saltator","Le

In [182]:
if __name__ == "__main__":
	parser = argparse.ArgumentParser()

	parser.add_argument("--fossil", help="Path to the csv of fossils.")
	parser.add_argument("--ages", help="Path to data ages TSV or CSV file containing \
	ages of non-contemporaneous tips, if any exist in your analysis.")
	parser.add_argument("--sample", help="How to sample fossils. Options: oldest, proportional, sampling.")
	parser.add_argument("--output", help="Path to where you'd like to write output")
	args = parser.parse_args()
	if args.set:
		df = args.set
	if args.ages:
		tnrs = args.ages
	if args.output:
		outfile = args.output		
		
	foss_tax = parse_dataframe(df)

usage: __main__.py [-h] [--fossil FOSSIL] [--ages AGES] [--sample SAMPLE]
                   [--output OUTPUT]
__main__.py: error: unrecognized arguments: -f /Users/april/Library/Jupyter/runtime/kernel-d687cf77-a22f-42d5-a29a-6963438ecc4d.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [98]:
fossil_df = pd.read_csv("../Data/FossilTNRS.csv")

In [103]:
fossils = pd.read_csv("../Data/fossil_tax_unit.tsv")

In [78]:
phylo_dat = dendropy.StandardCharacterMatrix.get_from_path("../Data/AntMegaMatrix.nex", schema="nexus", preserve_underscores=True)

In [210]:
extant_df = pd.read_csv("../Data/MorphTNRS.csv")

In [97]:
fossil_df

Unnamed: 0_level_0,reference_no,subfamily,tribe,genus,fossil,min_ma,max_ma,notes
taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Agroecomyrmex_duisburgi,44414,Agroecomyrmecinae,,Agroecomyrmex,Yes,33.900,38.00,Note
Eulithomyrmex_rugosus,4971,Agroecomyrmecinae,,Eulithomyrmex,Yes,33.900,37.20,Note
Eulithomyrmex_striatus,4971,Agroecomyrmecinae,,Eulithomyrmex,Yes,33.900,37.20,Note
Amblyoponinae,59088,Amblyoponinae,,,Yes,20.440,23.03,Note
Amblyoponini,53165,Amblyoponinae,Amblyoponini,Amblyoponini,Yes,20.440,23.03,Note
Casaleia_eocenica,43781,Amblyoponinae,,Casaleia,Yes,40.400,48.60,Note
Casaleia_inversa,41286,Amblyoponinae,,Casaleia,Yes,5.333,15.97,Note
Casaleia_longiventris,37344,Amblyoponinae,,Casaleia,Yes,11.608,12.70,Note
Casaleia_orientalis,56094,Amblyoponinae,,Casaleia,Yes,33.900,38.00,Note
Myopopone_sinensis,16579,Ambyloponinae,,Myopopone,Yes,11.608,15.97,Note
