In [1]:
import pandas as pd
import dendropy
import argparse

In [36]:
def parse_dataframe(df):
    '''Retrieve a taxon list from a dataframe.

    Parameters
    ----------
    df : str or pandas.DataFrame
        Path to a tab-separated (``.tsv``) or comma-separated (``.csv``)
        table, or an already-loaded DataFrame. The table must contain the
        columns ``taxon`` and ``max``.

    Returns
    -------
    pandas.DataFrame
        The ``taxon`` and ``max`` columns of the table.

    Raises
    ------
    ValueError
        If a path is given whose suffix is neither ``.tsv`` nor ``.csv``.
    KeyError
        If the table lacks a ``taxon`` or ``max`` column.
    '''
    if isinstance(df, pd.DataFrame):
        table = df
    elif df.endswith('.tsv'):
        table = pd.read_csv(df, delimiter="\t")
    elif df.endswith('.csv'):
        table = pd.read_csv(df)
    else:
        # Without this guard the path string itself would be indexed below,
        # producing a TypeError that says nothing about the real problem.
        raise ValueError(
            "Could not tell what file format the table is in: {!r}. "
            "Please use suffixes .tsv or .csv".format(df)
        )
    return table[['taxon', 'max']]

In [228]:
def parse_morphology(morph_mat):
    '''Read a Nexus-formatted morphological matrix and return its taxon namespace.

    The file at ``morph_mat`` is loaded as a dendropy standard character
    matrix with underscores in labels preserved; the ``taxon_namespace``
    attribute of that matrix is returned.
    '''
    matrix = dendropy.StandardCharacterMatrix.get_from_path(
        morph_mat,
        schema="nexus",
        preserve_underscores=True,
    )
    return matrix.taxon_namespace

In [328]:
def parse_molecular(mol_mat):
    '''Retrieve a taxon list from a molecular matrix.

    Parameters
    ----------
    mol_mat : str
        Path to a DNA alignment. Files ending in ``.nex`` are read as Nexus;
        files ending in ``.fasta`` or ``.fa`` are read as FASTA.

    Returns
    -------
    pandas.DataFrame
        One row per distinct taxon label, with columns ``taxon`` (label with
        single quotes stripped) and ``age`` (always 0).

    Raises
    ------
    ValueError
        If the suffix of ``mol_mat`` is not one of the recognised formats.
    '''
    if mol_mat.endswith('.nex'):
        molm = dendropy.DnaCharacterMatrix.get_from_path(
            mol_mat, schema="nexus", preserve_underscores=True
        )
    # A tuple is required here: ('.fasta' or '.fa') evaluates to '.fasta',
    # which silently excluded every '.fa' file.
    elif mol_mat.endswith(('.fasta', '.fa')):
        molm = dendropy.DnaCharacterMatrix.get_from_path(mol_mat, schema="fasta")
    else:
        # Fail here instead of printing and crashing later on an unbound name.
        raise ValueError(
            "Could not tell what file format molecular data are in. "
            "Please use suffixes .fa, .fasta, or .nex (got {!r})".format(mol_mat)
        )
    # str() of a taxon may wrap the label in single quotes; strip them so the
    # labels compare equal to plain-text names from other tables.
    labels = [str(taxon).replace("'", "") for taxon in molm.taxon_namespace]
    df = pd.DataFrame({'taxon': labels}).drop_duplicates()
    df['age'] = 0
    return df

In [329]:
def map_fossils(tnrs, ns):
    '''Decide which taxa in the morphology are fossils, and which are extant.

    Parameters
    ----------
    tnrs : str or pandas.DataFrame
        Path to a ``.tsv`` or ``.csv`` table, or an already-loaded DataFrame,
        with the columns ``taxon`` and ``max_ma``.
    ns : iterable
        Taxa to look up. Each item is converted with ``str()`` and has single
        quotes removed before being matched against the ``taxon`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``taxon`` and ``age``, one row per distinct label in ``ns``.
        ``age`` is the ``max_ma`` value for taxa found in the table and 0 for
        taxa that are absent. If a taxon occurs on several rows of the table,
        the largest ``max_ma`` is used.

    Raises
    ------
    ValueError
        If a path is given whose suffix is neither ``.tsv`` nor ``.csv``.
    '''
    if isinstance(tnrs, pd.DataFrame):
        table = tnrs
    elif tnrs.endswith('.tsv'):
        table = pd.read_csv(tnrs, delimiter="\t")
    elif tnrs.endswith('.csv'):
        table = pd.read_csv(tnrs)
    else:
        raise ValueError(
            "Could not tell what file format the table is in: {!r}. "
            "Please use suffixes .tsv or .csv".format(tnrs)
        )

    # Build the lookup once instead of filtering the table twice per taxon.
    # Taking the maximum also gives repeated taxa a defined age, where a
    # single-element .item() call would raise.
    oldest_by_taxon = table.groupby('taxon')['max_ma'].max().to_dict()

    dict_of_nameages = {}
    for item in ns:
        label = str(item).replace("'", "")
        dict_of_nameages[label] = oldest_by_taxon.get(label, 0)

    # Explicit columns keep the result well-formed even when ns is empty.
    fossil_df = pd.DataFrame({
        'taxon': list(dict_of_nameages.keys()),
        'age': list(dict_of_nameages.values()),
    })
    return fossil_df

In [330]:
# Input file locations, relative to the notebook's working directory.
# They are kept as plain strings because the parsing functions pick a reader
# by calling .endswith() on them.
morph_mat = "../Data/Morph/Data/AntMegaMatrixMinusAmbig.nex"  # Nexus morphological matrix
mol_mat = "../Data/Mol/Data/18s.nex.fasta"  # ends in .fasta, so parse_molecular reads it as FASTA
df = "../Teaching/RK_analysis/Data/Tribe_max.tsv"  # NOTE: a path string, not a DataFrame, despite the name
morphology_tnrs = "../Data/Morph/FossilTNRS.csv"  # table passed to map_fossils as `tnrs`

In [331]:
# Run each parser on its configured input path.
t_l = parse_dataframe(df)  # 'taxon'/'max' columns; not referenced again in the cells visible here
ns = parse_morphology(morph_mat)  # taxon namespace of the morphological matrix
molm =  parse_molecular(mol_mat)  # DataFrame of molecular taxa, all with age 0
foss =  map_fossils(morphology_tnrs, ns)  # DataFrame of morphological taxa with looked-up ages

                           taxon     age
0            Acanthoponera_minor    0.00
1            Amblyopone_pallipes    0.00
2                      Aneuretus    0.00
3                Anomalomyrma_sp    0.00
4                Apomyrma_stygia    0.00
5            Brownimecia_clavata   93.90
6                    Camelomecia   99.60
7           Cerapachys_sexspinus    0.00
8         Chalybion_californicum    0.00
9             Chyphotes_mellipes    0.00
10                 Formica_fusca   15.97
11       Gerontoformica_gracilis   99.60
12         Gerontoformica_magnus   99.60
13        Gerontoformica_pilosus   99.60
14       Gerontoformica_spiralis   99.60
15        Haidomyrmex_scimitarus   99.60
16       Haidomyrmodes_mammuthus  105.30
17          Haidoterminus_cippus   83.50
18                    Heterogyna    0.00
19            Hypoponera_opacior    0.00
20                     Kyromyrma    0.00
21           Lasius_californicus    0.00
22              Leptanilla_swani    0.00
23         Lepta

In [337]:
# Stack the morphological and molecular taxon tables, then drop rows that are
# exact duplicates (same taxon and same age) across the two sources.
big_matr = pd.concat([foss, molm]).drop_duplicates()

In [338]:
# Write the combined taxon/age table as tab-separated text, without the index.
big_matr.to_csv(
    "check",
    sep='\t',
    index=False,
)