In [1]:
import dendropy
import pandas as pd
import argparse
from random import randint

In [326]:
def pull_data_taxon(fossil_df, **kwargs):
    '''Sample fossils within each taxonomic group and write the result to two TSV files.

    Required keyword arguments:
        level: column of fossil_df to group by (e.g. subfamily, tribe, genus);
            lower-cased before use.
        age: how to sample within each group: oldest, youngest or random
            (case-insensitive). Ranking always uses the max_ma column.

    Optional keyword arguments:
        fraction: proportion of each group to keep. For oldest/youngest the
            per-group count is int(len(group) * fraction), so small groups may
            contribute nothing. When omitted, one fossil is taken per group.
        random_state: seed forwarded to DataFrame.sample when age is random.
        out_dir: directory that receives the output files (default "../Data").

    Writes (tab-separated, no index):
        <out_dir>/fossil_tax_unit.tsv with columns taxon, max_ma, <level>
        <out_dir>/fossil_sample.tsv with columns taxon, age

    Raises:
        KeyError: if level or age is not supplied.
        ValueError: if age is not one of oldest, youngest, random.
    '''
    # Validate up front so the result does not depend on keyword order and a
    # call without keywords fails loudly instead of silently doing nothing.
    if "level" not in kwargs:
        raise KeyError('level is a Required Argument that tells the program from which '
                       'taxonomic group to sample. Options include subfamily, tribe, genus')
    if "age" not in kwargs:
        raise KeyError('age is a Required Argument that specifies how to sample '
                       'within a taxonomic group. Options include oldest, youngest, random')
    group_key = kwargs["level"].lower()
    age_key = kwargs["age"].lower()
    if age_key not in ("oldest", "youngest", "random"):
        raise ValueError('age must be one of oldest, youngest, random; got %r' % (kwargs["age"],))
    fraction = kwargs.get("fraction")
    random_state = kwargs.get("random_state")
    out_dir = kwargs.get("out_dir", "../Data")

    # Select whole rows per group so every reported age stays attached to the
    # taxon it belongs to (a column-wise groupby max/min would mix rows).
    pieces = []
    for _, group in fossil_df.groupby(group_key):
        chosen = _pick_within_group(group, age_key, fraction, random_state)
        if len(chosen) > 0:
            pieces.append(chosen)
    picked = pd.concat(pieces) if pieces else fossil_df.iloc[0:0]

    # Same column layout for every strategy.
    tax_unit_df = picked[["taxon", "max_ma", group_key]]
    tax_unit_df.to_csv(out_dir + "/fossil_tax_unit.tsv", sep="\t", index=False)
    sample_df = picked[["taxon", "max_ma"]].rename(columns={"max_ma": "age"})
    sample_df.to_csv(out_dir + "/fossil_sample.tsv", sep="\t", index=False)


def _pick_within_group(group, age_key, fraction, random_state):
    '''Return the rows of one taxonomic group selected by the age strategy.'''
    if age_key == "random":
        if fraction is None:
            return group.sample(n=1, random_state=random_state)
        return group.sample(frac=fraction, random_state=random_state)
    # Truncation is intentional here: it mirrors int(len(x) * fraction).
    count = 1 if fraction is None else int(len(group) * fraction)
    if age_key == "oldest":
        return group.nlargest(count, "max_ma")
    return group.nsmallest(count, "max_ma")

In [327]:
# Example run: within each genus keep the oldest 50% of fossils (ranked by max_ma);
# the function writes its results to ../Data/fossil_tax_unit.tsv and ../Data/fossil_sample.tsv.
# NOTE(review): relies on fossil_df already existing in the kernel; the cell that reads it
# from CSV appears later in this file, so a top-to-bottom run would fail here.
pull_data_taxon(fossil_df, level="genus", age="oldest", fraction=.5)

In [483]:
def pull_data_sampling(fossil_df, **kwargs):
    '''Sample fossils through time and write the result to two TSV files.

    Required keyword arguments:
        strategy: sampling strategy (case-insensitive). Only uniform is
            implemented; diversified and time-stratified are recognised but
            raise NotImplementedError. time-stratified additionally requires
            time_bins (a list of lists or a dataframe of time bins).

    Optional keyword arguments (uniform strategy):
        freq: bin width as a fraction of the oldest max_ma (default .1).
        number: maximum number of fossils drawn per bin (default 1); bins with
            fewer fossils contribute all of them.
        random_state: seed forwarded to DataFrame.sample.
        out_dir: directory that receives the output files (default "../Data").

    Writes (tab-separated, no index):
        <out_dir>/fossil_tax_unit.tsv: sampled rows without the notes,
            reference_no, tribe, min_ma and fossil columns, plus an age column
            copied from max_ma.
        <out_dir>/fossil_sample.tsv: the same rows without subfamily, genus
            and max_ma.

    Raises:
        AssertionError: if no keyword arguments are given.
        KeyError: if strategy is missing, or time_bins is missing for
            time-stratified sampling.
        NotImplementedError: for the diversified and time-stratified strategies.
        ValueError: for an unknown strategy or a non-positive freq.
    '''
    assert len(kwargs) > 0, ("No required args provided. Must provide sampling strategy. "
                             "Options: uniform, diversified, time stratified.")
    if "strategy" not in kwargs:
        raise KeyError('strategy is a Required Argument that tells the program how to sample fossils through time. '
                       'Options: uniform, diversified, time stratified. If time-stratified, provide a list of bins.')
    type_key = kwargs["strategy"].lower()
    if type_key in ("time-stratified", "time stratified"):
        if "time_bins" not in kwargs:
            raise KeyError('For time-binned sampling, time bins must be specified with the time_bins kwarg. '
                           'Input may be a list of lists specifying sampling, or a dataframe of time bins')
        raise NotImplementedError('time-stratified sampling is not implemented yet')
    if type_key == "diversified":
        raise NotImplementedError('diversified sampling is not implemented yet')
    if type_key != "uniform":
        raise ValueError('Unknown strategy %r. Options: uniform, diversified, time stratified.'
                         % (kwargs["strategy"],))

    samp_freq = kwargs.get("freq", .1)
    if not samp_freq > 0:
        raise ValueError('freq must be a positive fraction of the oldest age; got %r' % (samp_freq,))
    number = kwargs.get("number", 1)
    random_state = kwargs.get("random_state")
    out_dir = kwargs.get("out_dir", "../Data")

    sampled = _sample_uniform_bins(fossil_df, samp_freq, number, random_state)

    # errors="ignore" lets tables that lack some bookkeeping columns through.
    sampled = sampled.drop(['notes', 'reference_no', 'tribe', 'min_ma', 'fossil'],
                           axis=1, errors="ignore")
    tax_unit_df = sampled.assign(age=sampled["max_ma"])
    # Tab-separated to match the .tsv extension and the files pull_data_taxon writes.
    tax_unit_df.to_csv(out_dir + "/fossil_tax_unit.tsv", sep="\t", index=False)
    sample_df = tax_unit_df.drop(['subfamily', 'genus', 'max_ma'], axis=1, errors="ignore")
    sample_df.to_csv(out_dir + "/fossil_sample.tsv", sep="\t", index=False)


def _sample_uniform_bins(fossil_df, samp_freq, number, random_state):
    '''Draw up to `number` fossils from each equal-width max_ma bin, oldest bin first.'''
    ages = fossil_df["max_ma"]
    oldest = float(ages.max())
    bin_width = oldest * samp_freq
    # An empty table (NaN) or non-positive ages leave nothing to bin.
    # round() follows the original bin count; when 1/freq is not an integer
    # the youngest part of the range may fall outside the bins.
    num_bins = round(oldest / bin_width) if bin_width > 0 else 0

    pieces = []
    for i in range(num_bins):
        top_interval = oldest - (bin_width * i)
        bottom_interval = oldest - (bin_width * (i + 1))
        # Half-open bins (bottom, top] so a fossil sitting exactly on a shared
        # boundary cannot be drawn from two bins; the youngest bin also
        # includes its lower edge.
        if i == num_bins - 1:
            in_bin = (ages >= bottom_interval) & (ages <= top_interval)
        else:
            in_bin = (ages > bottom_interval) & (ages <= top_interval)
        candidates = fossil_df[in_bin]
        if len(candidates) == 0:
            continue
        draw = min(number, len(candidates))
        pieces.append(candidates.sample(draw, random_state=random_state))
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(pieces) if pieces else fossil_df.iloc[0:0]

In [484]:
# Example run: uniform sampling through time, with bins 10% of the oldest max_ma wide
# and at most 3 fossils drawn per bin.
# NOTE(review): relies on fossil_df already existing in the kernel; the cell that reads it
# from CSV appears later in this file.
pull_data_sampling(fossil_df, strategy="uniform", freq=.1, number=3)

In [None]:
Make clade constraints

In [None]:
# Command-line entry point: parse the input/output paths and build the fossil table.
if __name__ == "__main__":
	parser = argparse.ArgumentParser()

	parser.add_argument("--fossil", help="Path to the csv of fossils.")
	parser.add_argument("--ages", help="Path to data ages TSV or CSV file containing \
	ages of non-contemporaneous tips, if any exist in your analysis.")
	parser.add_argument("--sample", help="How to sample fossils. Options: oldest, proportional, sampling.")
	parser.add_argument("--output", help="Path to where you'd like to write output")
	args = parser.parse_args()
	# The parser defines --fossil (there is no --set option), so the fossil
	# path lives on args.fossil; fail with a usage message when it is absent
	# rather than hitting an unbound df below.
	if not args.fossil:
		parser.error("--fossil is required: provide the path to the csv of fossils.")
	df = args.fossil
	if args.ages:
		tnrs = args.ages
	if args.output:
		outfile = args.output
	# NOTE(review): --sample is parsed but not used anywhere in this block.

	# NOTE(review): parse_dataframe is not defined in the cells visible here;
	# confirm it is defined or imported before this cell is run.
	foss_tax = parse_dataframe(df)

In [4]:
# Load the fossil table that pull_data_taxon and pull_data_sampling operate on.
# NOTE(review): this cell carries execution count In [4] but appears after the cells
# that use fossil_df; it must be run before them (ideally moved right below the imports).
fossil_df = pd.read_csv("../Data/FossilTNRS.csv")