<a href="https://colab.research.google.com/github/Sebukpor/Microbiome-Cytokine-Interactions/blob/main/cytokine_profiles_microbiome_merging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

df = pd.read_csv("/content/microbes_abundances.csv", nrows=0)
print(df.columns[:30].tolist())  # show first 30 columns


['SampleID', 'Segatella copri', 'Segatella hominis', 'Segatella oris', 'Segatella bryantii', 'Segatella baroniae', 'Paraprevotella clara', 'Paraprevotella xylaniphila', 'Prevotella multiformis', 'Prevotella nigrescens', 'Prevotella melaninogenica', 'Prevotella bivia', 'Prevotella corporis', 'Prevotella denticola', 'Prevotella intermedia', 'Prevotella veroralis', 'Prevotella dentalis', 'Prevotella histicola', 'Prevotella fusca', 'Prevotella scopos', 'Prevotella jejuni', 'Prevotella sp. oral taxon 299', 'Prevotella sp. oral taxon 313', 'Prevotella sp. oral taxon 475', 'Prevotella sp. Rep29', 'Prevotella sp. E13-17', 'Prevotella sp. E2-28', 'Prevotella sp. E9-3', 'Prevotella sp. E15-22', 'Prevotella herbatica']


In [1]:
import pandas as pd

def preprocess_microbiome_species(file_path, prevalence_threshold=0.05):
    """
    Preprocess Kraken2 microbiome classification data:
    1. Collapse species → genus (take first word of species name)
    2. Filter taxa by prevalence across samples

    Args:
        file_path (str): Path to Kraken2 CSV file
        prevalence_threshold (float): Minimum fraction of samples in which
                                      a genus must be present to keep (0-1)

    Returns:
        pd.DataFrame: Preprocessed microbiome dataset with SampleID + genus abundances
    """
    # Load CSV
    df = pd.read_csv(file_path)

    if "SampleID" not in df.columns:
        raise ValueError("CSV must contain a 'SampleID' column.")

    sample_ids = df["SampleID"]
    features = df.drop(columns=["SampleID"])

    # Extract genus name (first word from species name)
    genus_names = features.columns.str.split().str[0]

    # Collapse by genus
    features_T = features.T
    features_T.index = genus_names
    genus_df = features_T.groupby(features_T.index).sum().T

    # Filter by prevalence (fraction of samples where abundance > 0)
    prevalence = (genus_df > 0).sum(axis=0) / genus_df.shape[0]
    keep_genera = prevalence[prevalence >= prevalence_threshold].index
    genus_filtered = genus_df[keep_genera]

    # Re-attach SampleID
    genus_filtered.insert(0, "SampleID", sample_ids)

    return genus_filtered


# -------- Example usage --------
microbiome_processed = preprocess_microbiome_species("/content/microbes_abundances.csv", prevalence_threshold=0.05)

print("✅ Preprocessing complete")
print("Shape:", microbiome_processed.shape)
print("Genera kept:", len(microbiome_processed.columns) - 1)
print(microbiome_processed.head())


✅ Preprocessing complete
Shape: (2901, 235)
Genera kept: 234
    SampleID  Abyssalbus  Acetivibrio  Acetobacter  Acetobacterium  \
0  ID_NZMJOU           0            0            0               0   
1  ID_WKHSUY           6            0            0               0   
2  ID_WZQPFI           0            0            0               0   
3  ID_NJQQDA           0            0            0               0   
4  ID_PVTYWN           0            0            1               0   

   Achromobacter  Acidaminococcus  Acinetobacter  Advenella  Aequorivita  ...  \
0              0              249              0          0            0  ...   
1              0                0             24          0            7  ...   
2              0                0              0          0            0  ...   
3              0                0              0          0            0  ...   
4              0                0              0          0            0  ...   

   Vibrio  Winogradskyella  Wol

In [None]:
import pandas as pd

# -------- Load metadata and cytokine profiles --------
train = pd.read_csv("/content/Train.csv")
# train columns: filename, SampleType, SubjectID, SampleID

cytokines = pd.read_csv("/content/cytokine_profiles.csv")
# cytokine columns: SampleID, IL2, IL4, ...

# microbiome_processed is already a DataFrame in memory from your preprocessing function

# -------- Step 1: Map filenames (without extension) to SampleID --------
train["filename_noext"] = train["filename"].str.replace(".mgb", "", regex=False)

# -------- Step 2: Rename microbiome_processed SampleID to match train filename --------
microbiome_processed = microbiome_processed.rename(columns={"SampleID": "FileID"})

# -------- Step 3: Merge microbiome with train metadata to get SampleID & SampleType --------
microbiome_merged = pd.merge(
    microbiome_processed,
    train[["SampleID", "SampleType", "filename_noext"]],
    left_on="FileID",
    right_on="filename_noext",
    how="inner"
)

# Reorder columns: SampleID, SampleType, then microbiome features
cols = ["SampleID", "SampleType"] + [c for c in microbiome_merged.columns if c not in ["SampleID", "SampleType", "FileID", "filename_noext"]]
microbiome_merged = microbiome_merged[cols]

# -------- Step 4: Merge with cytokine profiles on SampleID --------
final_merged = pd.merge(
    microbiome_merged,
    cytokines,
    on="SampleID",
    how="inner"
)

# -------- Step 5: Save final merged dataset as CSV --------
final_merged.to_csv("/content/merged_microbiome_cytokines.csv", index=False)

print("✅ Merged dataset saved as CSV")
print("Final shape:", final_merged.shape)
print(final_merged.head())


✅ Merged dataset saved as CSV
Final shape: (1982, 308)
        SampleID SampleType  Abyssalbus  Acetivibrio  Acetobacter  \
0  Sample_XICOMY      Stool           0            0            0   
1  Sample_XIXUXC      Mouth           0            0            0   
2  Sample_FWPZOL      Mouth           0            0            0   
3  Sample_EQKKMY      Nasal           0            0            0   
4  Sample_CITRKC      Stool           0            0            0   

   Acetobacterium  Achromobacter  Acidaminococcus  Acinetobacter  Advenella  \
0               0              0              249              0          0   
1               0              0                0              0          0   
2               0              0                0              0          0   
3               0              0                0              0          0   
4               0              0               23              0         20   

   ...        ENA78         CHEX1       CHEX2        CH

In [None]:
print(final_merged.tail())

           SampleID SampleType  Abyssalbus  Acetivibrio  Acetobacter  \
1977  Sample_GCZTEE      Stool           0            2            0   
1978  Sample_AULWKZ       Skin           0            0            0   
1979  Sample_WCDSEH      Nasal           0            0            0   
1980  Sample_MEXICC      Nasal           0            0            0   
1981  Sample_VVXXQI      Nasal           0            0            0   

      Acetobacterium  Achromobacter  Acidaminococcus  Acinetobacter  \
1977               0              0              158              0   
1978               0              0                0              0   
1979               0              0                0              0   
1980               0             47                0              0   
1981               0              0                0              0   

      Advenella  ...       ENA78         CHEX1       CHEX2        CHEX3  \
1977          0  ...  173.589093  10133.470275  581.331770  1030.

In [None]:
data= pd.read_csv('/content/merged_microbiome_cytokines.csv')

In [None]:
data.columns.tolist()

['SampleID',
 'SampleType',
 'Abyssalbus',
 'Acetivibrio',
 'Acetobacter',
 'Acetobacterium',
 'Achromobacter',
 'Acidaminococcus',
 'Acinetobacter',
 'Advenella',
 'Aequorivita',
 'Aeromonas',
 'Aestuariibaculum',
 'Agathobacter',
 'Aggregatimonas',
 'Agrococcus',
 'Agromyces',
 'Akkermansia',
 'Algibacter',
 'Alistipes',
 'Allochromatium',
 'Aminipila',
 'Amycolatopsis',
 'Amygdalobacter',
 'Anaerobutyricum',
 'Anaerocolumna',
 'Anaerostipes',
 'Anaerotruncus',
 'Antarcticibacterium',
 'Arabiibacter',
 'Arenibacter',
 'Aristaeella',
 'Azospirillum',
 'Bacillus',
 'Bacteroides',
 'Bartonella',
 'Bifidobacterium',
 'Blattabacterium',
 'Blautia',
 'Bordetella',
 'Bradyrhizobium',
 'Brucella',
 'Burkholderia',
 'Butyricimonas',
 'Caloramator',
 'Campylobacter',
 'Candidatus',
 'Capnocytophaga',
 'Caproicibacterium',
 'Catenibacterium',
 'Cellulomonas',
 'Changchengzhania',
 'Chitinophaga',
 'Chlamydia',
 'Christiangramia',
 'Chryseobacterium',
 'Citrobacter',
 'Clostridium',
 'Collinsell