3 Reasons for this notebook:

1. Get the ASV table with non-occurring ASVs (ASVs with zero reads in every sample) filtered out.

2. Get the ASV table with non-occurring ASVs filtered out and samples below the rarefaction depth excluded, for use in the Alpha/Beta Diversity notebooks.

3. Get the ASV table with non-occurring ASVs filtered out and rarefaction applied (averaged over n=1000 rarefaction iterations), ready for use in the in-terminal Phylogenetic RPCA.

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw input table from a CSV file located in the working directory.
RAW_TABLE_CSV = "FINAL_UNASSIGNED_RAW.csv"
df = pd.read_csv(RAW_TABLE_CSV)

In [5]:
# Select the ASV count columns by position: skip the first column and the
# last 13 columns of the raw table.
# NOTE(review): presumably column 0 is a sample identifier and the trailing 13
# columns are sample metadata — confirm against the CSV header.
N_LEADING_NON_ASV_COLS = 1
N_TRAILING_NON_ASV_COLS = 13
reads_asv = df.iloc[:, N_LEADING_NON_ASV_COLS:-N_TRAILING_NON_ASV_COLS]

In [6]:
# DELETE FEATURES WITH NO OCCURRENCES
#
# Produces two frames:
#   reads_asv_cleaned     - the ASV count columns minus all-zero features
#   df_reads_asv_cleaned  - the full table (`df`) minus the same features
# and finally rebinds `reads_asv` to the cleaned counts for the cells below.

# Step 1: Calculate the number of non-zero occurrences for each feature (species)
non_zero_counts = (reads_asv != 0).sum(axis=0)

# Step 2: Identify features that occur in no samples
no_sample_features = non_zero_counts[non_zero_counts == 0]

# Step 3: Drop these features (species) from both reads_asv and the full table.
# Dropping an empty set of labels is a no-op that still returns a new frame, so
# both outputs are always defined — including when nothing needs removing
# (e.g. when this cell is re-run on already-cleaned counts).
reads_asv_cleaned = reads_asv.drop(columns=no_sample_features.index)
df_reads_asv_cleaned = df.drop(columns=no_sample_features.index)

if len(no_sample_features) > 0:
    print(f"Removed {len(no_sample_features)} species that occur in no samples.")
else:
    print("No species were removed; all species occur in at least one sample.")

# Step 4: Check the shape of the cleaned data
print(f"Original shape of reads_filtered: {reads_asv.shape}")
print(f"Shape after cleaning: {reads_asv_cleaned.shape}")

# Downstream cells read the cleaned counts through the name `reads_asv`.
reads_asv = reads_asv_cleaned

Removed 1765 species that occur in no samples.
Original shape of reads_filtered: (311, 13579)
Shape after cleaning: (311, 11814)


In [7]:
# Write the cleaned count matrix and the cleaned full table to CSV,
# omitting the pandas row index from both files.
for table, csv_name in (
    (reads_asv, "cleaned_asv_reads.csv"),
    (df_reads_asv_cleaned, "cleaned_asv_df.csv"),
):
    table.to_csv(csv_name, index=False)

In [8]:
# Minimum total read count a sample must exceed to be considered when choosing
# the rarefaction depth.
MIN_READS_THRESHOLD = 20000

# Start from a fresh frame without any stale 'Row_Sum' column. `drop` returns a
# new DataFrame, so adding 'Row_Sum' below does not modify
# `df_reads_asv_cleaned` in place (a plain assignment would only alias it).
df = df_reads_asv_cleaned.drop(columns=['Row_Sum'], errors='ignore')

# Compute total reads per sample
df['Row_Sum'] = reads_asv.sum(axis=1)

# Determine the rarefaction depth (minimum read count above 20,000)
depths = df.loc[df['Row_Sum'] > MIN_READS_THRESHOLD, 'Row_Sum']
if depths.empty:
    raise ValueError(
        f"No sample has more than {MIN_READS_THRESHOLD} total reads; "
        "cannot determine a rarefaction depth."
    )
rarefaction_depth = depths.min()

# Filter out samples with total reads less than the rarefaction depth
filtered_df = df[df['Row_Sum'] >= rarefaction_depth].copy().reset_index(drop=True)

# Select the ASV count columns by label — exactly the columns that were summed
# into 'Row_Sum' — instead of relying on a fixed number of trailing columns.
reads_asv_filtered = filtered_df.loc[:, reads_asv.columns]

In [9]:
# Write the depth-filtered full table and its count matrix to CSV,
# omitting the pandas row index from both files.
for table, csv_name in (
    (filtered_df, "cleaned_filtered_asv_df.csv"),
    (reads_asv_filtered, "cleaned_filtered_asv_reads.csv"),
):
    table.to_csv(csv_name, index=False)

In [None]:
# Function to perform rarefaction (subsampling) on a row (sample)
def rarefy_row(row, rarefaction_depth, rng=None):
    """Subsample one sample's read counts down to ``rarefaction_depth`` reads.

    The counts are converted to proportions and ``rarefaction_depth`` reads are
    drawn from a multinomial distribution with those proportions.
    NOTE(review): a multinomial draw samples with replacement; classical
    rarefaction subsamples without replacement — confirm this is intended.

    Parameters
    ----------
    row : pandas.Series or numpy.ndarray
        Per-feature read counts for a single sample.
    rarefaction_depth : int
        Number of reads to draw.
    rng : numpy.random.Generator, optional
        Generator used for the draw. When omitted, the legacy global NumPy
        random state (``np.random.multinomial``) is used, so results can still
        be made reproducible with ``np.random.seed``.

    Returns
    -------
    The original ``row`` object, unchanged, when it holds fewer reads than
    ``rarefaction_depth`` (or no reads at all); otherwise a 1-D numpy integer
    array of subsampled counts with the same length as ``row``.
    """
    total_reads = row.sum()

    # Too few reads to subsample. The explicit zero check covers an all-zero
    # row at depth 0, which would otherwise produce 0/0 proportions.
    if total_reads < rarefaction_depth or total_reads == 0:
        return row

    proportions = np.asarray(row / total_reads, dtype=float)
    if rng is None:
        return np.random.multinomial(rarefaction_depth, proportions)
    return rng.multinomial(rarefaction_depth, proportions)


# Seed NumPy's global random state so the averaged table is reproducible on a
# fresh "Restart & Run All" (rarefy_row draws from np.random by default).
RAREFACTION_SEED = 42
np.random.seed(RAREFACTION_SEED)

# How many times to rarefy
num_iterations = 1000

# Extract the count matrix once instead of rebuilding row objects (and a
# throw-away DataFrame) on every iteration.
sample_counts = reads_asv_filtered.to_numpy()

# Accumulator for the rarefied counts, same shape as the input data
average_matrix = np.zeros(sample_counts.shape)

# Perform rarefaction multiple times and accumulate the results in place.
# Samples are visited in row order within each iteration.
for _ in range(num_iterations):
    for sample_idx, sample in enumerate(sample_counts):
        average_matrix[sample_idx] += rarefy_row(sample, rarefaction_depth)

# Calculate the average matrix by dividing the accumulated matrix by the number of iterations
average_matrix /= num_iterations

# Convert back to a DataFrame carrying the original column labels and row index
average_rarefied_df = pd.DataFrame(average_matrix, columns=reads_asv_filtered.columns, index=reads_asv_filtered.index)

# Display the averaged matrix
average_rarefied_df

In [None]:
# Display the averaged rarefied table built in the previous cell.
average_rarefied_df