In [2]:
import pandas as pd

import xml.etree.ElementTree as ET

def get_attr(attributes, name):
    """Return the parsed value of the first ``Attribute`` child named ``name``.

    Parameters
    ----------
    attributes : xml.etree.ElementTree.Element
        Element whose direct ``Attribute`` children are searched.
    name : str
        Value of the ``attribute_name`` XML attribute to look for.

    Returns
    -------
    object or None
        The Python literal (int, float, ...) encoded by the element text, or
        ``None`` when no matching child exists or the match has no text.

    Raises
    ------
    ValueError, SyntaxError
        If the text of the matching child is not a valid Python literal.
    """
    # Local import keeps this block self-contained.
    import ast

    for attr in attributes.findall('Attribute'):
        if attr.attrib.get('attribute_name') == name:
            text = (attr.text or '').strip()
            if not text:
                # An empty element carries no value.
                return None
            # The text comes from an external XML file: literal_eval parses
            # plain literals only, whereas eval() would execute arbitrary code.
            return ast.literal_eval(text)
    return None

def get_ers_id(ids_elem):
    """Return the text of the first ``Id`` child whose ``db`` attribute is 'SRA'.

    Only direct ``Id`` children of ``ids_elem`` are inspected. ``None`` is
    returned when no child has ``db == 'SRA'``.
    """
    sra_texts = (
        node.text
        for node in ids_elem.findall('Id')
        if node.attrib.get('db') == 'SRA'
    )
    return next(sra_texts, None)

def get_data(xml):
    """Parse a BioSample XML file into one row of metadata per sample.

    Parameters
    ----------
    xml : str or file object
        Source passed straight to ``ET.parse``.

    Returns
    -------
    list of list
        One ``[ers, lon, lat, ph, nitrogen, carbon, water]`` entry for every
        ``BioSample`` child of the document root. Attributes that are absent
        from a sample are reported as ``None``.
    """
    root = ET.parse(xml).getroot()

    data = []
    for sample in root.findall('BioSample'):
        ers = get_ers_id(sample.find('Ids'))
        attrs = sample.find('Attributes')

        lat = get_attr(attrs, 'latitude')
        lon = get_attr(attrs, 'longitude')
        # Longitudes are forced to be non-positive (presumably some records
        # omit the sign for a western-hemisphere site -- TODO confirm).
        # A missing longitude stays None instead of raising TypeError.
        if lon is not None:
            lon = -abs(lon)
        ph = get_attr(attrs, 'ph')
        nitrogen = get_attr(attrs, "tot_nitro")
        carbon = get_attr(attrs, "tot_org_carb")
        water = get_attr(attrs, "water_content_soil")

        data.append([ers, lon, lat, ph, nitrogen, carbon, water])

    return data


In [3]:
# Load the per-sample metadata, keep the first sample seen at each
# (longitude, latitude) pair, and index the table by sample id.
data = get_data('/Users/cp68wp/Downloads/biosample_result.xml')
df = (
    pd.DataFrame(
        data,
        columns=["sample", "longitude", "latitude", "pH", "nitrogen", "carbon", "water"],
    )
    .drop_duplicates(subset=["longitude", "latitude"])
    .set_index("sample")
)

In [4]:
# Read the tab-separated table, reduce each sample label to its last
# dot-separated token, and average rows that share the resulting label.
Y = (
    pd.read_csv("/Users/cp68wp/Downloads/20250418_selection_ERP006161.tsv", sep="\t")
    .assign(sample=lambda table: table["sample"].str.split(".").str[-1])
    .groupby("sample")
    .mean()
)

# Sample ids present in both the metadata table and Y.
intersection = list(set(df.index) & set(Y.index))

In [38]:
# Restrict both tables to the shared sample ids and sort them by index.
df, Y = (
    frame.loc[frame.index.isin(intersection)].sort_index()
    for frame in (df, Y)
)

In [41]:
# Split the metadata table into coordinate columns and the remaining
# measurement columns.
coords = df.loc[:, ["longitude", "latitude"]]
X = df.loc[:, ["pH", "nitrogen", "carbon", "water"]]

In [43]:
# Write the three tables as CSV files into the project data folder.
folder = "/Users/cp68wp/Documents/GitHub/Biogeography/data/central_park/"
for _frame, _filename in ((Y, "Y.csv"), (X, "X.csv"), (coords, "XY.csv")):
    _frame.to_csv(folder + _filename)

In [5]:
# Display the column labels of Y.
Y.columns

Index(['90_185;96_16405;97_20124;98_25148;99_34011',
       '90_216;96_15732;97_19276;98_24071;99_64198',
       '90_3105;96_30279;97_57620;98_76387;99_112705',
       '90_15845;96_20525;97_25540;98_32388;99_44643',
       '90_2158;96_43284;97_55396;98_98858;99_149512',
       '90_1518;96_32731;97_41456;98_54108;99_77453',
       '90_17850;96_69182;97_89870;98_122195;99_188475',
       '90_816;96_25073;97_78740;98_106385;99_161692',
       '90_10523;96_45814;97_58791;98_78067;99_115470',
       '90_372;96_5501;97_6633;98_8238;99_34284',
       ...
       '90_10;96_43344;97_55471;98_114950;99_175741',
       '90_10583;96_46061;97_59102;98_78474;99_116106',
       '90_976;96_57977;97_75022;98_101091;99_153194',
       '90_15798;96_5835;97_22064;98_27710;99_37686',
       '90_1451;96_6216;97_7472;98_9205;99_167561',
       '90_5783;96_25829;97_32396;98_41532;99_58334',
       '90_142;96_7953;97_17569;98_21830;99_43178',
       '90_19736;96_80737;97_103907;98_139646;99_210799',
       '90_