In [1]:
import pandas as pd
import numpy as np
from scipy.stats import gmean

In [2]:
# Load the three input tables from the current working directory.
# `KO` and `Path` are read from Excel workbooks; `df` is read from CSV and
# must contain a 'Sample_ID' column, which later cells use to select and
# merge samples.
# NOTE(review): the name `Path` is bound to a DataFrame here; it would shadow
# `pathlib.Path` if that were ever imported into this notebook.
KO = pd.read_excel("KO_pred_metagenome_unstrat_descrip.xlsx")
Path = pd.read_excel("path_abun_unstrat_descrip.xlsx")
df = pd.read_csv("cleaned_asv_df.csv")

In [3]:
# Remove a leading "<single ASCII letter>_" prefix from every column label of
# both tables (e.g. "X_123" -> "123"); labels without that prefix are unchanged.
KO.columns, Path.columns = (
    table.columns.str.replace(r'^[A-Za-z]_', '', regex=True)
    for table in (KO, Path)
)

In [4]:
# Sample IDs present in df, cast to str so they compare equal to the
# (string) column labels of the Path and KO tables.
sample_ids = df["Sample_ID"].astype(str).tolist()

# --- Path ---
# Keep the first two columns plus every later column whose label is one of
# the sample IDs found in df.
columns_to_keep = [col for col in Path.columns[2:] if col in sample_ids]
Path_filtered = Path.iloc[:, :2].join(Path[columns_to_keep])
Path = Path_filtered

# --- KO ---
# Same filter for the KO table. The sample columns are taken from KO itself;
# selecting them from Path would fill the KO table with pathway values (or
# raise KeyError for samples that are missing from Path).
columns_to_keep = [col for col in KO.columns[2:] if col in sample_ids]
KO_filtered = KO.iloc[:, :2].join(KO[columns_to_keep])
KO = KO_filtered

In [5]:
# Flip Path so that the former column labels become the row labels.
path_transpose = Path.T

# The first transposed row holds the values of Path's first column: promote it
# to the header, discard that row, and expose the former column labels as a
# regular 'Sample_ID' column.
path_transpose.columns = path_transpose.iloc[0]
path_transpose = (
    path_transpose
    .drop(path_transpose.index[0])
    .reset_index()
    .rename(columns={'index': 'Sample_ID'})
)

# The merge key must have the same dtype on both sides, so cast both to str.
path_transpose['Sample_ID'] = path_transpose['Sample_ID'].astype(str)
df['Sample_ID'] = df['Sample_ID'].astype(str)

# Left-join the trailing 13 columns of df onto the transposed table by Sample_ID.
path_transpose = path_transpose.merge(
    df[['Sample_ID', *df.columns[-13:]]],
    on='Sample_ID',
    how='left',
)

# From here on, Path refers to the transposed table with the merged df columns.
Path = path_transpose

In [6]:
# Flip KO so that the former column labels become the row labels.
ko_transpose = KO.T

# The first transposed row holds the values of KO's first column: use it as
# the header, remove that row, and turn the former column labels into a
# regular 'Sample_ID' column.
ko_transpose.columns = ko_transpose.iloc[0]
ko_transpose = (
    ko_transpose
    .drop(ko_transpose.index[0])
    .reset_index()
    .rename(columns={'index': 'Sample_ID'})
)

# Both sides of the merge need a string Sample_ID key.
ko_transpose['Sample_ID'] = ko_transpose['Sample_ID'].astype(str)
df['Sample_ID'] = df['Sample_ID'].astype(str)

# Left-join the trailing 13 columns of df onto the transposed table by Sample_ID.
ko_transpose = ko_transpose.merge(
    df[['Sample_ID', *df.columns[-13:]]],
    on='Sample_ID',
    how='left',
)

# From here on, KO refers to the transposed table with the merged df columns.
KO = ko_transpose


In [7]:
# Positional slices selecting the feature block of each table:
#   rows 1:      skip row 0, which holds the values of the input table's
#                second column (it became a row when the table was transposed)
#   cols 1:-13   skip column 0 ('Sample_ID') and the 13 trailing columns that
#                were merged in from df
# NOTE(review): 13 is hard-coded here and in the merge/processing code; the
# slices silently select the wrong columns if the number of merged columns changes.
Path_numeric = Path.iloc[1:,1:-13]
KO_numeric = KO.iloc[1:,1:-13]

In [8]:
# Feature columns that carry no signal: every entry is NaN, or every entry is 0.
cols_to_drop_path = Path_numeric.columns[
    Path_numeric.isna().all() | Path_numeric.eq(0).all()
]
cols_to_drop_ko = KO_numeric.columns[
    KO_numeric.isna().all() | KO_numeric.eq(0).all()
]

# Remove those columns from the numeric slices ...
Path_numeric = Path_numeric.drop(cols_to_drop_path, axis=1)
KO_numeric = KO_numeric.drop(cols_to_drop_ko, axis=1)

# ... and from the full tables, so both keep the same feature columns.
Path = Path.drop(cols_to_drop_path, axis=1)
KO = KO.drop(cols_to_drop_ko, axis=1)

In [9]:
# Write the transposed, merged and column-filtered tables (before any CLR
# transform) to CSV in the working directory. index=False leaves the
# positional row index out of the files.
KO.to_csv("KO_df.csv", index=False)
Path.to_csv("Path_df.csv", index=False)

In [10]:
def impute_pseudo_values(df, rng=None):
    """Replace zeros with small random pseudo values, one row at a time.

    For every row whose sum is positive, a detection limit of ``1 / row_sum``
    is computed and each entry equal to 0 is replaced by a draw from
    ``uniform(0.1 * detection_limit, detection_limit)``. Entries that are not
    equal to 0 (including NaN, which never compares equal to 0) are kept as
    they are. Rows whose sum is not positive are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are imputed independently of each other.
    rng : object with a ``uniform(low, high)`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)`` or
        ``np.random.RandomState(seed)``. Pass a seeded generator to make the
        imputation reproducible. When omitted, NumPy's global random state is
        used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Imputed table with the same index and columns as ``df``.
    """
    # Default to the module-level functions so callers relying on
    # np.random.seed(...) keep getting the same stream of values.
    sampler = np.random if rng is None else rng

    imputed_rows = []
    for _, row in df.iterrows():
        total_sum = row.sum()
        if total_sum > 0:
            detection_limit = 1 / total_sum
            lower_bound = 0.1 * detection_limit
            upper_bound = detection_limit
            imputed_rows.append(
                row.apply(
                    lambda x: sampler.uniform(lower_bound, upper_bound) if x == 0 else x
                )
            )
        else:
            # Nothing to scale against: leave the row untouched.
            imputed_rows.append(row)

    # Rebuild the frame with the caller's index and column order.
    return pd.DataFrame(imputed_rows, columns=df.columns, index=df.index)

def clr(df):
    """Apply the centered log-ratio (CLR) transform to each row of ``df``.

    Each row is cast to float, divided by its geometric mean and passed
    through the natural logarithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of strictly positive values; all columns are transformed.

    Returns
    -------
    pandas.DataFrame
        CLR values with the columns of ``df``; row labels are those of the
        input rows.

    Raises
    ------
    ValueError
        If any value in a row is less than or equal to zero.
    """
    feature_names = df.columns
    transformed = []
    for label, values in df.iterrows():
        as_float = values[feature_names].astype(float)
        # The logarithm is undefined for zero/negative entries, so fail loudly.
        has_non_positive = (as_float <= 0).any()
        if has_non_positive:
            raise ValueError(f"Non-positive values found in row {label}.")
        transformed.append(np.log(as_float / gmean(as_float)))
    return pd.DataFrame(transformed, columns=feature_names)

def process_data_with_originals(df_original, df_numeric, n_meta_cols=13):
    """Impute zeros, CLR-transform the numeric block and re-attach the rest.

    The output is laid out as: the first column of ``df_original``, then the
    feature columns, then the last ``n_meta_cols`` columns of ``df_original``.
    In the feature columns, the first row of ``df_original`` is kept as it is
    and the rows of ``df_numeric`` carry their CLR-transformed values. Rows
    are aligned on the index labels.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Full table: one leading identifier column, the feature columns, and
        ``n_meta_cols`` trailing columns.
    df_numeric : pandas.DataFrame
        Feature block of ``df_original`` to transform (not modified; a copy is
        passed on to the imputation step).
    n_meta_cols : int, default 13
        Number of trailing columns of ``df_original`` that are passed through
        untouched.

    Returns
    -------
    pandas.DataFrame
        Combined table in which every original column appears exactly once.

    Raises
    ------
    ValueError
        Propagated from ``clr`` when a row still holds a non-positive value
        after imputation.
    """
    # Step 1: replace zeros by pseudo values so the logarithm is defined.
    df_numeric_imputed = impute_pseudo_values(df_numeric.copy())

    # Step 2: CLR transformation.
    df_numeric_clr = clr(df_numeric_imputed)

    # Position where the trailing pass-through columns start, clamped at 0 the
    # same way a negative slice bound would be.
    meta_start = max(df_original.shape[1] - n_meta_cols, 0)

    id_column = df_original.iloc[:, :1]
    metadata = df_original.iloc[:, meta_start:]

    # Step 3: put the untouched first row back on top of the CLR rows. Only
    # the feature columns are taken from it: carrying the identifier or the
    # trailing columns along would duplicate them in the final concat.
    first_row_features = df_original.iloc[:1, 1:meta_start]
    features = pd.concat([first_row_features, df_numeric_clr], axis=0)

    # Step 4: re-attach the identifier and trailing columns, aligned on index.
    return pd.concat([id_column, features, metadata], axis=1)


In [11]:
# Run imputation + CLR on both feature blocks, then write the combined tables
# to CSV (row index omitted). Both transforms finish before either file is written.
# NOTE(review): the imputation step draws from np.random.uniform and no seed is
# set in the visible cells, so the written values can differ between runs.
Path_combined = process_data_with_originals(Path, Path_numeric)
KO_combined = process_data_with_originals(KO, KO_numeric)
Path_combined.to_csv("Path_CLR_df.csv", index = False)
KO_combined.to_csv("KO_CLR_df.csv", index = False)