In [31]:
import pandas as pd

In [32]:
# Input tables read by the cells below.
metafile = "../../data/Arabidopsis_metadata.tsv"  # tab-separated sample metadata
ttfile = "../../data/all_tissue_type.csv"  # tissue labels with VegetativeRepro/AboveBelow
# Output tables written by the cells below.
filtered_metafile = "../../data/metadata_UMR75.csv"
tissue_mapfile = "../../data/tissue_type_map_UMR75.csv"
# Minimum UniqueMappedRate, expressed as a fraction, for a sample to be kept.
thresh = 0.75

In [33]:
# Load the sample metadata, recode the "/" placeholder as "Other", and convert
# the percentage strings in UniqueMappedRate into fractions (strip the trailing
# "%" and divide by 100).
# Written as one chain so no frame is mutated in place and no scratch variable
# is left behind in the kernel namespace.
mdf = (
    pd.read_csv(metafile, sep="\t")
    .replace("/", "Other")
    .assign(
        UniqueMappedRate=lambda frame: (
            frame["UniqueMappedRate"].str.rstrip("%").astype(float) / 100.0
        )
    )
)
mdf.head(3)

Unnamed: 0,Sample,Project,SampleName,PMID,Genotype,Ecotype,Tissue,TotalReads,UniqueMappedRate,ReleaseDate
0,DRX007662,PRJDB2180,Arabidopsis WT-Col mRNA_seq,23934508,wild type,Col-0,Other,30664389,0.862,4/2/2014
1,DRX007663,PRJDB2180,Arabidopsis ibm1-4 mRNA_seq,23934508,ibm1-4,Col-0,Other,38551905,0.911,4/2/2014
2,DRX007664,PRJDB2180,Arabidopsis ibm2-2 mRNA_seq,23934508,ibm2-2,Col-0,Other,37223057,0.834,4/2/2014


In [34]:
# Keep only the samples whose UniqueMappedRate is at least `thresh`.
# Nothing is written to disk in this cell.
mdf_filtered = mdf.loc[mdf["UniqueMappedRate"].ge(thresh)]
print(f"shape before filtering: {mdf.shape}\n")
print(f"shape after filtering: {mdf_filtered.shape}")

shape before filtering: (28164, 10)

shape after filtering: (19415, 10)


In [35]:
# Tabulate how many samples carry each distinct Tissue label, most frequent
# first. Counts are taken over `mdf`, i.e. before the UniqueMappedRate filter.
# NOTE(review): confirm the unfiltered frame is the intended source here.
# rename_axis/reset_index builds the two-column frame directly, which avoids
# Series.ravel() (deprecated in recent pandas releases) and the manual zip.
tdf = (
    mdf["Tissue"]
    .value_counts()
    .rename_axis("Tissue")
    .reset_index(name="Count")
)
print(f"Number of unique tisue labels: {tdf.shape[0]}")
print(f"tdf columns: {tdf.columns}")

Number of unique tisue labels: 333
tdf columns: Index(['Tissue', 'Count'], dtype='object')


In [36]:
# Load the existing tissue annotation table. Missing values become "Unknown"
# and the "/" placeholder becomes "Other".
tdf_old = (
    pd.read_csv(ttfile, skipinitialspace=True)
    .fillna("Unknown")
    .replace("/", "Other")
)
print(f"tdf_old columns: {tdf_old.columns}")
tdf_old.head(2)

tdf_old columns: Index(['Tissue', 'Counts', 'Tissue.1', 'VegetativeRepro', 'AboveBelow',
       'Debatable'],
      dtype='object')


Unnamed: 0,Tissue,Counts,Tissue.1,VegetativeRepro,AboveBelow,Debatable
0,seedlings,4402,Seedling,WholePlant,WholePlant,Unknown
1,leaves,3633,Leaf,Vegetative,Above,Unknown


In [37]:
# Attach the annotation columns of `tdf_old` to the per-label counts in `tdf`
# and save the result as the tissue map.
# The merge uses pandas' default inner join, so a Tissue label that is missing
# from `tdf_old` does not appear in the map.
tdf_new = (
    tdf.merge(tdf_old, on="Tissue")
    # "Counts" and "Debatable" come from `tdf_old`; "Count" from `tdf` is kept.
    .drop(columns=["Counts", "Debatable"])
)
print(f"tdf_new columns {tdf_new.columns}")
tdf_new.to_csv(tissue_mapfile, index=False)

tdf_new columns Index(['Tissue', 'Count', 'Tissue.1', 'VegetativeRepro', 'AboveBelow'], dtype='object')


In [38]:
# Annotate every filtered sample with the tissue-map columns. The left join
# keeps samples whose Tissue has no entry in `tdf_new`. The per-label "Count"
# column is then dropped and "Sample" is renamed to "SampleID".
# Nothing is written to disk in this cell.
mdf_new = (
    mdf_filtered
    .merge(tdf_new, how="left", on="Tissue")
    .drop(columns=["Count"])
    .rename(columns={"Sample": "SampleID"})
)
print(mdf_new.shape)
print(mdf_new.columns)

(19415, 13)
Index(['SampleID', 'Project', 'SampleName', 'PMID', 'Genotype', 'Ecotype',
       'Tissue', 'TotalReads', 'UniqueMappedRate', 'ReleaseDate', 'Tissue.1',
       'VegetativeRepro', 'AboveBelow'],
      dtype='object')


In [39]:
# Disabled relabelling, kept for reference only (none of it runs):
#   VegetativeRepro: "Root" -> "Vegetative", "Hypotocyl" -> "Vegetative"
#   AboveBelow:      "Seed" -> "WholePlant"

# Report how many samples fall into each class of the two label columns.
print(f'AboveBelow Classes: {mdf_new["AboveBelow"].value_counts()}')
# The eight spaces after the colon are part of the printed text.
print(
    "VegetativeRepro Classes:"
    f'        {mdf_new["VegetativeRepro"].value_counts()}'
)

# Save the filtered, annotated metadata as a comma-separated file.
mdf_new.to_csv(filtered_metafile, index=False, sep=",")

AboveBelow Classes: Above         8855
WholePlant    6692
Below         2313
Unknown       1058
Seed           497
Name: AboveBelow, dtype: int64
VegetativeRepro Classes:        Vegetative      8018
WholePlant      6692
Root            2062
Reproductive    1365
Unknown         1027
Hypotocyl        251
Name: VegetativeRepro, dtype: int64
