In [2]:
import biom

# Read the BIOM table from disk and pull the IDs along its observation axis;
# the rest of the notebook treats those IDs as the feature (ASV) identifiers.
biom_table = biom.load_table('cleaned_asv_reads.biom')
biom_feature_ids = biom_table.ids(axis='observation')

# Report how many feature IDs the table carries.
print(
    f"Number of feature IDs in BIOM: {len(biom_feature_ids)}"
)


Number of feature IDs in BIOM: 11814


In [1]:
from ete3 import Tree

# Parse the Newick file produced by the tree-building step.
tree = Tree('fasttree_output.nwk')

# Every leaf name is taken as a feature ID for the comparisons further down.
tree_tip_names = [
    tip.name
    for tip in tree.get_leaves()
]

print(
    f"Number of tips (feature IDs) in phylogenetic tree: {len(tree_tip_names)}"
)


Number of tips (feature IDs) in phylogenetic tree: 12462


In [4]:
import biom

# Re-read the BIOM table so this cell has fresh feature IDs to compare.
# `tree_tip_names` must already exist from the tree-loading cell.
biom_table = biom.load_table('cleaned_asv_reads.biom')
biom_feature_ids = biom_table.ids(axis='observation')

# Set views of both ID collections make the two-way difference straightforward.
biom_feature_ids_set = set(biom_feature_ids)
tree_tip_names_set = set(tree_tip_names)

# IDs found on only one side of the BIOM/tree comparison.
missing_in_tree = biom_feature_ids_set.difference(tree_tip_names_set)
missing_in_biom = tree_tip_names_set.difference(biom_feature_ids_set)

# Only the sizes are reported; the ID sets stay available for inspection.
print(f"Number of features in BIOM but missing in tree: {len(missing_in_tree)}")
print(f"Number of tips in tree but missing in BIOM: {len(missing_in_biom)}")


Number of features in BIOM but missing in tree: 964
Number of tips in tree but missing in BIOM: 1612


In [7]:
import pandas as pd

# Load the CSV two directories up into `df`; later cells slice its columns
# with `1:-13` to reach the per-feature count columns.
df = pd.read_csv(filepath_or_buffer="../../FINAL_UNASSIGNED_RAW.csv")

In [11]:
# Column labels of `df` excluding the first column and the last 13 columns;
# these labels are compared against the BIOM/tree feature IDs below.
df_feature_ids = df.columns[1:-13]

In [13]:
# Build set views of the three ID collections so membership can be compared.
biom_feature_ids_set = set(biom_feature_ids)
tree_tip_names_set = set(tree_tip_names)
df_feature_ids_set = set(df_feature_ids)

# IDs present in the BIOM table or the tree but absent from the DataFrame columns.
missing_in_df_from_biom = biom_feature_ids_set.difference(df_feature_ids_set)
missing_in_df_from_tree = tree_tip_names_set.difference(df_feature_ids_set)

# DataFrame columns that appear in neither the BIOM table nor the tree.
extra_in_df = df_feature_ids_set.difference(biom_feature_ids_set, tree_tip_names_set)

# Report sizes only; the sets themselves remain available for inspection.
print(f"Number of BIOM features missing in df: {len(missing_in_df_from_biom)}")
print(f"Number of tree tips missing in df: {len(missing_in_df_from_tree)}")
print(f"Number of extra features in df (not in BIOM or tree): {len(extra_in_df)}")



Number of BIOM features missing in df: 0
Number of tree tips missing in df: 0
Number of extra features in df (not in BIOM or tree): 153


In [14]:
# Boolean mask over the feature columns: True where the column total is exactly 0.
zero_abundance_features = df.iloc[:, 1:-13].sum(axis=0).eq(0)

# Summing the mask counts the True entries.
print("Number of features with zero counts:", zero_abundance_features.sum())


Number of features with zero counts: 1765


In [1]:
# Collect the record IDs from the representative-sequence FASTA file.
from Bio import SeqIO

fasta_file = "2_feature_rep_seq.fasta"

# One ID per FASTA record, in file order.
fasta_feature_ids = [
    entry.id
    for entry in SeqIO.parse(fasta_file, "fasta")
]

# Show the total and a short preview of the IDs.
print(f"Number of feature IDs in FASTA: {len(fasta_feature_ids)}")
print(f"First 5 feature IDs in FASTA: {fasta_feature_ids[:5]}")


Number of feature IDs in FASTA: 13579
First 5 feature IDs in FASTA: ['d0ab2c15400fe710288526c9a33083fb', '21b04aaabca0f92a5ba0a0b2eaa31aea', '0b0a4bdddb823efeb9984bf862fe81f8', 'c13359a9f895b1a2b9fcf49ceaea0e8a', '0c870b228c94b71c91663a829ef6f119']


In [2]:
import biom

# Read the BIOM table and take the IDs along its observation axis as the
# feature (ASV) IDs.
biom_table = biom.load_table("cleaned_asv_reads.biom")
biom_feature_ids = biom_table.ids(axis='observation')

# Show the total and a short preview of the IDs.
print(f"Number of feature IDs in BIOM: {len(biom_feature_ids)}")
print(f"First 5 feature IDs in BIOM: {biom_feature_ids[:5]}")


Number of feature IDs in BIOM: 11814
First 5 feature IDs in BIOM: ['d0ab2c15400fe710288526c9a33083fb' '21b04aaabca0f92a5ba0a0b2eaa31aea'
 '0b0a4bdddb823efeb9984bf862fe81f8' 'c13359a9f895b1a2b9fcf49ceaea0e8a'
 '0c870b228c94b71c91663a829ef6f119']


In [3]:
# Set views of the BIOM and FASTA feature IDs for a two-way comparison.
biom_feature_ids_set = set(biom_feature_ids)
fasta_feature_ids_set = set(fasta_feature_ids)

# IDs that appear on only one side.
missing_in_fasta = biom_feature_ids_set - fasta_feature_ids_set
missing_in_biom = fasta_feature_ids_set - biom_feature_ids_set

print(f"Number of features in BIOM but missing in FASTA: {len(missing_in_fasta)}")
print(f"Number of features in FASTA but missing in BIOM: {len(missing_in_biom)}")

# Preview up to five offending IDs for each non-empty difference.
if len(missing_in_fasta) > 0:
    print("Features missing in FASTA:", list(missing_in_fasta)[:5])

if len(missing_in_biom) > 0:
    print("Features missing in BIOM:", list(missing_in_biom)[:5])


Number of features in BIOM but missing in FASTA: 0
Number of features in FASTA but missing in BIOM: 1765
Features missing in BIOM: ['19328c951805383902ac2b3bc1fba6cc', 'fb38adbf606b98eeece28959426158a4', '9ed2882583b59fb263020077488d7286', '5ebc9e5b8b092320e813853dfc37dfeb', '9c14f8183db54483322050f3a4ca978b']


In [4]:
from ete3 import Tree

# Parse the Newick tree and gather the name of each leaf; the leaf names are
# used as feature IDs in the comparisons that follow.
tree = Tree('fasttree_output.nwk')
tree_tip_names = [
    tip.name
    for tip in tree.get_leaves()
]

# Show the total and a short preview of the tip names.
print(f"Number of tips (feature IDs) in the tree: {len(tree_tip_names)}")
print(f"First 5 tip names (feature IDs): {tree_tip_names[:5]}")


Number of tips (feature IDs) in the tree: 12462
First 5 tip names (feature IDs): ['9a85d30febaa394c1083dd687ed757c1', '3893a582adcc0d8b8fbb5e02c85765e0', 'c91e9e2a3188bec6b956dc6adc655b4c', '1c592a2262b8475753c43c78d3f4dc2a', '61d08ffa009e0e36d9a5474c44f0b11f']


In [5]:
# Set view of the tree tip names; the BIOM and FASTA sets come from earlier cells.
tree_tip_names_set = set(tree_tip_names)

# BIOM <-> tree: IDs present on only one side.
missing_in_tree = biom_feature_ids_set.difference(tree_tip_names_set)
missing_in_biom_from_tree = tree_tip_names_set.difference(biom_feature_ids_set)

# FASTA <-> tree: IDs present on only one side.
missing_in_tree_from_fasta = fasta_feature_ids_set.difference(tree_tip_names_set)
missing_in_fasta_from_tree = tree_tip_names_set.difference(fasta_feature_ids_set)

# Report the four difference sizes, BIOM comparison first.
print(f"Number of features in BIOM but missing in tree: {len(missing_in_tree)}")
print(f"Number of tips in tree but missing in BIOM: {len(missing_in_biom_from_tree)}")
print(f"Number of features in FASTA but missing in tree: {len(missing_in_tree_from_fasta)}")
print(f"Number of tips in tree but missing in FASTA: {len(missing_in_fasta_from_tree)}")


Number of features in BIOM but missing in tree: 964
Number of tips in tree but missing in BIOM: 1612
Number of features in FASTA but missing in tree: 1117
Number of tips in tree but missing in FASTA: 0


In [8]:
import pandas as pd

# Column totals over the slice `1:-13`, which this notebook treats as the
# feature (ASV) count columns of `df`.
feature_sums = df.iloc[:, 1:-13].sum(axis=0)

# Labels of the columns whose total is exactly zero.
features_with_zero_counts = feature_sums.loc[feature_sums.eq(0)].index

print(f"Number of features with zero counts: {len(features_with_zero_counts)}")
print("First 5 features with zero counts:", features_with_zero_counts[:5])


Number of features with zero counts: 1765
First 5 features with zero counts: Index(['5663d3189e63b5a0d313a9d72fe36219', '12c59436fc6878f112dbe4349a94fc55',
       '04a5ef3b91567108b45c784b29537285', 'b3178f890057092ec418c3eb87734da9',
       '9c5914841ac3fee3b40ec70a658b8a6f'],
      dtype='object')


In [9]:
from Bio import SeqIO

# Re-read the record IDs from the representative-sequence FASTA file.
fasta_file = "2_feature_rep_seq.fasta"
fasta_feature_ids = [
    entry.id
    for entry in SeqIO.parse(fasta_file, "fasta")
]

# Zero-count feature labels that also occur as FASTA record IDs; these are the
# records the filtering cell will drop.
zero_count_id_set = set(features_with_zero_counts)
features_to_remove_from_fasta = zero_count_id_set.intersection(fasta_feature_ids)

print(f"Number of features to remove from FASTA: {len(features_to_remove_from_fasta)}")
print("First 5 features to remove from FASTA:", list(features_to_remove_from_fasta)[:5])


Number of features to remove from FASTA: 1765
First 5 features to remove from FASTA: ['19328c951805383902ac2b3bc1fba6cc', 'fb38adbf606b98eeece28959426158a4', '9ed2882583b59fb263020077488d7286', '5ebc9e5b8b092320e813853dfc37dfeb', '9c14f8183db54483322050f3a4ca978b']


In [12]:
# Write a copy of the FASTA file that omits every record whose ID is in
# `features_to_remove_from_fasta`.
import os

output_fasta_file = "filtered_fasta.fasta"

# Guard against hidden notebook state: a later cell rebinds `fasta_file` to the
# filtered file. Re-running this cell after that would open the output for
# writing (truncating it) before the lazy parser had read anything from it.
if os.path.abspath(fasta_file) == os.path.abspath(output_fasta_file):
    raise ValueError(
        f"Input and output FASTA are the same file ({output_fasta_file}); "
        "re-run the cell that sets `fasta_file` to the unfiltered FASTA first."
    )

with open(output_fasta_file, "w") as output_handle:
    # A single SeqIO.write call accepts an iterable of records, so the kept
    # records are streamed through one writer instead of one call per record.
    SeqIO.write(
        (
            record
            for record in SeqIO.parse(fasta_file, "fasta")
            if record.id not in features_to_remove_from_fasta
        ),
        output_handle,
        "fasta",
    )

print(f"Filtered FASTA file saved as {output_fasta_file}")


Filtered FASTA file saved as filtered_fasta.fasta


In [14]:
from Bio import SeqIO
import pandas as pd

# Compare how many feature IDs the filtered FASTA and the cleaned ASV CSV hold.
# The FASTA file is parsed once; the earlier version of this cell parsed and
# printed it twice through a copy-pasted block.

# Record IDs in the filtered FASTA file.
fasta_file = "filtered_fasta.fasta"
fasta_feature_ids = [record.id for record in SeqIO.parse(fasta_file, "fasta")]
print(f"Number of feature IDs in filtered FASTA: {len(fasta_feature_ids)}")

# Load the cleaned ASV reads table.
# NOTE: this rebinds `df`, which earlier cells used for a different CSV.
csv_file = "cleaned_asv_reads.csv"
df = pd.read_csv(csv_file)

# Every column after the first is counted as a feature ID.
# NOTE(review): assumes the first CSV column is not a feature (e.g. a saved
# index column) - TODO confirm, since an off-by-one here would by itself
# produce a one-ID difference between the two counts.
csv_feature_ids = df.columns[1:]
print(f"Number of feature IDs in cleaned ASV CSV: {len(csv_feature_ids)}")

# Only the counts are compared here, not the ID values themselves.
if len(fasta_feature_ids) == len(csv_feature_ids):
    print("The number of feature IDs is the same in both files.")
else:
    print("The number of feature IDs differs between the files.")
    print(f"Number of feature IDs in filtered FASTA: {len(fasta_feature_ids)}")
    print(f"Number of feature IDs in cleaned ASV CSV: {len(csv_feature_ids)}")


Number of feature IDs in filtered FASTA: 11814
Number of feature IDs in cleaned ASV CSV: 11813
Number of feature IDs in filtered FASTA: 11814
The number of feature IDs differs between the files.
Number of feature IDs in filtered FASTA: 11814
Number of feature IDs in cleaned ASV CSV: 11813


In [15]:
# Display `df` as the cell's output (rich DataFrame rendering).
df

Unnamed: 0,d0ab2c15400fe710288526c9a33083fb,21b04aaabca0f92a5ba0a0b2eaa31aea,0b0a4bdddb823efeb9984bf862fe81f8,c13359a9f895b1a2b9fcf49ceaea0e8a,0c870b228c94b71c91663a829ef6f119,91c9e0419bfe8038aed98cba9b51cb9d,629676293377721f621917a3f268dbce,91eadffc47a7950a37aae47e5e289b92,e73c0fa69ad4221f822e2e5dcb920e76,35791f882ef8d2b3177ac86ac87889b7,...,5cc600b5f4cdd6b20b04d45033d34190,f70a4d78b3f510adc28a2639462e136e,1eae4d45f0e3214e6b92503db8ec6eea,b51cc2da6b9e829137a512e897e5ded3,1ceb1318bcc7fe42e6aaa8f3421e914b,ee931404df61f94a50909ebc684686be,92a55ef19d4202dcaec8996ed22c6f57,97f27850d68e355845fff07d7bb8f21a,4527ae01b904c7218c708a3e630bcd3b,5a83f8c7ebe5640613280f443c7a180d
0,391,694,1058,364,624,0,0,0,647,944,...,0,0,0,0,0,0,0,0,0,0
1,2782,3982,2966,1084,5608,0,0,0,840,0,...,0,0,0,0,0,0,0,0,0,0
2,754,1524,6077,330,2481,0,161,24,182,635,...,0,0,0,0,0,0,0,0,0,0
3,2547,3013,292,1254,170,0,0,0,411,59,...,0,0,0,0,0,0,0,0,0,0
4,243,48,3864,0,0,0,0,0,76,4865,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,2258,0,2049,0,0,0,0,0,0,3771,...,0,0,0,0,0,0,0,0,0,0
307,703,277,12,3706,12,2830,0,1400,457,0,...,0,0,0,0,0,0,0,0,0,0
308,974,2522,660,1000,51,0,0,39,597,0,...,0,0,0,0,0,0,0,0,0,0
309,816,2435,47,0,0,0,0,0,65,243,...,0,0,0,0,0,0,0,0,0,0


In [16]:
import biom

# Location of the BIOM table to inspect.
biom_file = "cleaned_asv_reads.biom"

# Load the table and take the IDs along its observation axis as the
# feature (ASV) IDs.
biom_table = biom.load_table(biom_file)
biom_feature_ids = biom_table.ids(axis='observation')

# Report how many feature IDs the table carries.
print(
    f"Number of feature IDs in BIOM: {len(biom_feature_ids)}"
)


Number of feature IDs in BIOM: 11814


In [17]:
# Set views of the filtered-FASTA and BIOM feature IDs.
fasta_feature_ids_set = set(fasta_feature_ids)
biom_feature_ids_set = set(biom_feature_ids)

# First a size check on the two sets of unique IDs.
if len(fasta_feature_ids_set) != len(biom_feature_ids_set):
    print("The number of feature IDs differs between the files.")
    print(f"Number of feature IDs in filtered FASTA: {len(fasta_feature_ids_set)}")
    print(f"Number of feature IDs in BIOM: {len(biom_feature_ids_set)}")
else:
    print("The number of feature IDs is the same in both files.")

# Then the actual membership check: IDs present on only one side.
missing_in_biom = fasta_feature_ids_set - biom_feature_ids_set
print(f"Number of feature IDs in FASTA but missing in BIOM: {len(missing_in_biom)}")

missing_in_fasta = biom_feature_ids_set - fasta_feature_ids_set
print(f"Number of feature IDs in BIOM but missing in FASTA: {len(missing_in_fasta)}")

# Preview up to five IDs for each non-empty difference.
if len(missing_in_biom) > 0:
    print(f"Feature IDs missing in BIOM: {list(missing_in_biom)[:5]}")
if len(missing_in_fasta) > 0:
    print(f"Feature IDs missing in FASTA: {list(missing_in_fasta)[:5]}")


The number of feature IDs is the same in both files.
Number of feature IDs in FASTA but missing in BIOM: 0
Number of feature IDs in BIOM but missing in FASTA: 0
