In [1]:
import pandas as pd
import glob
import numpy as np
import re

# Pre-process the reads

In [3]:
df_files=pd.read_csv("bioproject_run_crc_pd.tsv",sep="\t", header=None)
df_files

Unnamed: 0,0,1
0,SRR16124367,PRJNA763023
1,SRR16124366,PRJNA763023
2,SRR16124255,PRJNA763023
3,SRR16124244,PRJNA763023
4,SRR16124233,PRJNA763023
...,...,...
1307,DRR171669,PRJDB4176
1308,DRR171670,PRJDB4176
1309,DRR171671,PRJDB4176
1310,DRR171672,PRJDB4176


In [6]:
import os
import shutil
base_dir = "SEPT_2025"

for idx, row in df_files.iterrows():
    run_id = row[0]          # SRR/ERR/DRR id
    bioproject = row[1]      # BioProject id
    
    run_path = os.path.join(base_dir, run_id)
    target_dir = os.path.join(base_dir, bioproject)
    
    if os.path.exists(run_path):
        os.makedirs(target_dir, exist_ok=True)
        
        # Move Run ID folder into its BioProject folder
        target_path = os.path.join(target_dir, run_id)
        if not os.path.exists(target_path):
            shutil.move(run_path, target_dir)
            print(f"Moved {run_id} → {bioproject}")
        else:
            print(f"Skipped {run_id}, already in {bioproject}")
    else:
        print(f"Run folder missing: {run_id}")

Moved SRR16124367 → PRJNA763023
Moved SRR16124366 → PRJNA763023
Moved SRR16124255 → PRJNA763023
Moved SRR16124244 → PRJNA763023
Moved SRR16124233 → PRJNA763023
Moved SRR16124222 → PRJNA763023
Moved SRR16124211 → PRJNA763023
Moved SRR16124200 → PRJNA763023
Moved SRR16124189 → PRJNA763023
Moved SRR16124178 → PRJNA763023
Moved SRR16124365 → PRJNA763023
Moved SRR16124354 → PRJNA763023
Moved SRR16124343 → PRJNA763023
Moved SRR16124332 → PRJNA763023
Moved SRR16124321 → PRJNA763023
Moved SRR16124310 → PRJNA763023
Moved SRR16124299 → PRJNA763023
Moved SRR16124288 → PRJNA763023
Moved SRR16124277 → PRJNA763023
Moved SRR16124266 → PRJNA763023
Moved SRR16124254 → PRJNA763023
Moved SRR16124253 → PRJNA763023
Moved SRR16124252 → PRJNA763023
Moved SRR16124251 → PRJNA763023
Moved SRR16124250 → PRJNA763023
Moved SRR16124249 → PRJNA763023
Moved SRR16124248 → PRJNA763023
Moved SRR16124247 → PRJNA763023
Moved SRR16124246 → PRJNA763023
Moved SRR16124245 → PRJNA763023
Moved SRR16124243 → PRJNA763023
Moved SR

In [1]:
import os
import pandas as pd

base_dir = "SEPT_2025"
output_dir = "SEPT_2025"

os.makedirs(output_dir, exist_ok=True)

for bioproject in os.listdir(base_dir):
    bioproject_path = os.path.join(base_dir, bioproject)
    if os.path.isdir(bioproject_path) and bioproject.startswith("PRJ"):
        records = []
        
        # List subfolders (Run IDs)
        for subfolder in os.listdir(bioproject_path):
            subfolder_path = os.path.join(bioproject_path, subfolder)
            if os.path.isdir(subfolder_path):
                records.append([bioproject, subfolder])
        
        if records:
            df = pd.DataFrame(records)
            output_file = os.path.join(output_dir, f"{bioproject}_samples.tsv")
            df.to_csv(output_file, sep=",", index=False, header=False)
            print(f"✅ Saved {output_file}")


In [31]:
import os
import re
import pandas as pd

# Directory jaha aapki .err files hain
err_dir = "processing/"
# output_file = "summary_table.tsv"

# Regex patterns
patterns = {
    "BioProject": re.compile(r"JSON report:.*/(PRJ\w+)/"),
    "Sample": re.compile(r"JSON report:.*/(SRR\d+)_fastp.json"),
    "Read1_before": re.compile(r"Read1 before filtering:\s*total reads:\s*(\d+)"),
    "Read1_after": re.compile(r"Read1 after filtering:\s*total reads:\s*(\d+)"),
    "Read2_before": re.compile(r"Read2 before filtering:\s*total reads:\s*(\d+)"),
    "Read2_after": re.compile(r"Read2 after filtering:\s*total reads:\s*(\d+)"),
    "Adapter_trimmed": re.compile(r"reads with adapter trimmed:\s*(\d+)"),
    "Kraken_total": re.compile(r"(\d+) sequences \("),
    "Classified_reads": re.compile(r"(\d+) sequences classified \(([\d\.]+)%\)"),
    "Unclassified_reads": re.compile(r"(\d+) sequences unclassified \(([\d\.]+)%\)")
}

rows = []

# Iterate over all .err files
for fname in os.listdir(err_dir):
    if fname.endswith(".err"):
        with open(os.path.join(err_dir, fname)) as f:
            text = f.read()

        row = {"File": fname}
        for key, pat in patterns.items():
            match = pat.search(text)
            if match:
                if key == "Classified_reads":
                    row["Classified_reads"] = match.group(1)
                    row["Classified_%"] = match.group(2)
                elif key == "Unclassified_reads":
                    row["Unclassified_reads"] = match.group(1)
                    row["Unclassified_%"] = match.group(2)
                else:
                    row[key] = match.group(1)

        rows.append(row)

# Convert to DataFrame
df = pd.DataFrame(rows)

# Reorder columns
df = df[[
    "BioProject", "Sample", "Read1_before", "Read1_after",
    "Read2_before", "Read2_after", "Adapter_trimmed",
    "Kraken_total", "Classified_reads", "Classified_%",
    "Unclassified_reads", "Unclassified_%", "File"
]]

df


Unnamed: 0,BioProject,Sample,Read1_before,Read1_after,Read2_before,Read2_after,Adapter_trimmed,Kraken_total,Classified_reads,Classified_%,Unclassified_reads,Unclassified_%,File
0,PRJNA447983,SRR6915104,19929853,19606863,19929853,19606863,275752,39213726,34465845,87.89,4747881,12.11,693749_7_PRJNA447983_SRR6915104.err
1,,,,,,,,,,,,,693419_372_PRJDB4176_DRR171749.err
2,PRJNA1237248,SRR32733101,63543388,63519518,,,0,63519518,56798216,89.42,6721302,10.58,234325_776_PRJNA1237248_SRR32733101.err
3,PRJNA447983,SRR6915202,38260125,37605760,38260125,37605760,97838,75211520,67052033,89.15,8159487,10.85,693749_60_PRJNA447983_SRR6915202.err
4,PRJNA429097,SRR6456233,25150422,23282694,25150422,23282694,40784,46565388,42068321,90.34,4497067,9.66,693416_47_PRJNA429097_SRR6456233.err
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2978,PRJNA1237248,SRR32733605,63354398,63326505,,,0,63326505,55203524,87.17,8122981,12.83,234325_639_PRJNA1237248_SRR32733605.err
2979,PRJNA447983,SRR6915207,53029256,52187816,53029256,52187816,193256,104375632,92615496,88.73,11760136,11.27,693749_69_PRJNA447983_SRR6915207.err
2980,,,,,,,,,,,,,693419_297_PRJDB4176_DRR171558.err
2981,PRJNA1237248,SRR32733401,37355963,37341652,,,0,37341652,33929209,90.86,3412443,9.14,234325_48_PRJNA1237248_SRR32733401.err


In [32]:
df_clean = df.dropna(subset=['BioProject'])
df_clean.to_csv("pre_processed_read_informations.tsv",sep="\t",index="False")
df_clean

Unnamed: 0,BioProject,Sample,Read1_before,Read1_after,Read2_before,Read2_after,Adapter_trimmed,Kraken_total,Classified_reads,Classified_%,Unclassified_reads,Unclassified_%,File
0,PRJNA447983,SRR6915104,19929853,19606863,19929853,19606863,275752,39213726,34465845,87.89,4747881,12.11,693749_7_PRJNA447983_SRR6915104.err
2,PRJNA1237248,SRR32733101,63543388,63519518,,,0,63519518,56798216,89.42,6721302,10.58,234325_776_PRJNA1237248_SRR32733101.err
3,PRJNA447983,SRR6915202,38260125,37605760,38260125,37605760,97838,75211520,67052033,89.15,8159487,10.85,693749_60_PRJNA447983_SRR6915202.err
4,PRJNA429097,SRR6456233,25150422,23282694,25150422,23282694,40784,46565388,42068321,90.34,4497067,9.66,693416_47_PRJNA429097_SRR6456233.err
6,PRJNA1237248,SRR32733279,35463020,35448640,,,0,35448640,30132251,85.00,5316389,15.00,234325_624_PRJNA1237248_SRR32733279.err
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2976,PRJNA1237248,SRR32733551,43597349,43577017,,,0,43577017,38846389,89.14,4730628,10.86,234325_748_PRJNA1237248_SRR32733551.err
2977,PRJEB72525,,10523082,6210161,10523082,6210161,362058,12420322,10302508,82.95,2117814,17.05,232906_2_PRJEB72525_ERR14205363.err
2978,PRJNA1237248,SRR32733605,63354398,63326505,,,0,63326505,55203524,87.17,8122981,12.83,234325_639_PRJNA1237248_SRR32733605.err
2979,PRJNA447983,SRR6915207,53029256,52187816,53029256,52187816,193256,104375632,92615496,88.73,11760136,11.27,693749_69_PRJNA447983_SRR6915207.err


In [17]:
df_clean = df_clean.apply(pd.to_numeric, errors='ignore')
df_clean

  df_clean = df_clean.apply(pd.to_numeric, errors='ignore')


Unnamed: 0,BioProject,Sample,Read1_before,Read1_after,Read2_before,Read2_after,Adapter_trimmed,Kraken_total,Classified_reads,Classified_%,Unclassified_reads,Unclassified_%,File
0,PRJNA447983,SRR6915104,19929853,19606863,19929853.0,19606863.0,275752,39213726.0,34465845.0,87.89,4747881.0,12.11,693749_7_PRJNA447983_SRR6915104.err
2,PRJNA1237248,SRR32733101,63543388,63519518,,,0,63519518.0,56798216.0,89.42,6721302.0,10.58,234325_776_PRJNA1237248_SRR32733101.err
3,PRJNA447983,SRR6915202,38260125,37605760,38260125.0,37605760.0,97838,75211520.0,67052033.0,89.15,8159487.0,10.85,693749_60_PRJNA447983_SRR6915202.err
4,PRJNA429097,SRR6456233,25150422,23282694,25150422.0,23282694.0,40784,46565388.0,42068321.0,90.34,4497067.0,9.66,693416_47_PRJNA429097_SRR6456233.err
6,PRJNA1237248,SRR32733279,35463020,35448640,,,0,35448640.0,30132251.0,85.00,5316389.0,15.00,234325_624_PRJNA1237248_SRR32733279.err
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2976,PRJNA1237248,SRR32733551,43597349,43577017,,,0,43577017.0,38846389.0,89.14,4730628.0,10.86,234325_748_PRJNA1237248_SRR32733551.err
2977,PRJEB72525,,10523082,6210161,10523082.0,6210161.0,362058,12420322.0,10302508.0,82.95,2117814.0,17.05,232906_2_PRJEB72525_ERR14205363.err
2978,PRJNA1237248,SRR32733605,63354398,63326505,,,0,63326505.0,55203524.0,87.17,8122981.0,12.83,234325_639_PRJNA1237248_SRR32733605.err
2979,PRJNA447983,SRR6915207,53029256,52187816,53029256.0,52187816.0,193256,104375632.0,92615496.0,88.73,11760136.0,11.27,693749_69_PRJNA447983_SRR6915207.err


In [30]:
df_clean["BioProject"].value_counts()

BioProject
PRJNA1237248    853
PRJEB72524      203
PRJNA763023     200
PRJNA429097     191
PRJEB72523      163
PRJNA731589     163
PRJNA447983     140
PRJEB59350      134
PRJEB72525      124
PRJNA1167935     71
PRJEB72526       60
PRJNA531273      30
Name: count, dtype: int64

In [24]:
df_clean_less = df_clean[~(df_clean["Classified_reads"] > 80)]
df_clean_less

Unnamed: 0,BioProject,Sample,Read1_before,Read1_after,Read2_before,Read2_after,Adapter_trimmed,Kraken_total,Classified_reads,Classified_%,Unclassified_reads,Unclassified_%,File
513,PRJNA1237248,SRR32733041,50240945,50218398,,,0,,,,,,234325_744_PRJNA1237248_SRR32733041.err
843,PRJNA1237248,SRR32733216,39514110,39493544,,,0,,,,,,234325_761_PRJNA1237248_SRR32733216.err
2037,PRJNA1237248,SRR32733261,43269800,43251538,,,0,,,,,,234325_816_PRJNA1237248_SRR32733261.err


In [None]:
# Remove the samples which is <80% 

In [None]:
#1 min number of reads in each sample, should be at least 1000000 (1million, 100K)
#2 Per Bioproject classified reads = average reads in millions, 


In [11]:
df.describe()

Unnamed: 0,BioProject,Sample,Read1_before,Read1_after,Read2_before,Read2_after,Adapter_trimmed,Kraken_total,Classified_reads,Classified_%,Unclassified_reads,Unclassified_%,File
count,2332,1648,2332,2332,1479,1479,2332,2329,2329,2329.0,2329,2329.0,2983
unique,12,1648,2332,2332,1479,1479,1474,2329,2329,932.0,2329,932.0,2983
top,PRJNA1237248,SRR6915104,19929853,19606863,19929853,19606863,0,39213726,34465845,89.03,4747881,10.97,693749_7_PRJNA447983_SRR6915104.err
freq,853,1,1,1,1,1,853,1,1,11.0,1,11.0,1


In [37]:
# Metadata
meta_df=pd.read_csv("meadata_crc.tsv",sep="\t")
meta_df = meta_df[~(meta_df["BioProject"] == "PRJDB4176")]
meta_df

Unnamed: 0,Run_ID,BioProject,BioSample,Health_status,Phenotype,Full_Name,Sex,Age,Location,Platform,BMI,CRC stage
0,SRR30861073,PRJNA1167935,SAMN44019356,Non-Healthy,CRC,Colorectal Cancer,FEMALE,75.0,Turkey,ILLUMINA,,II
1,SRR30861074,PRJNA1167935,SAMN44019355,Non-Healthy,CRC,Colorectal Cancer,MALE,55.0,Turkey,ILLUMINA,,IV
2,SRR30861075,PRJNA1167935,SAMN44019379,Healthy,Healthy,Healthy,MALE,64.0,Turkey,ILLUMINA,,
3,SRR30861076,PRJNA1167935,SAMN44019377,Healthy,Healthy,Healthy,FEMALE,46.0,Turkey,ILLUMINA,,
4,SRR30861077,PRJNA1167935,SAMN44019374,Healthy,Healthy,Healthy,MALE,52.0,Turkey,ILLUMINA,,
...,...,...,...,...,...,...,...,...,...,...,...,...
5139,ERR1018308,PRJEB10878,SAMEA3541592,Healthy,Healthy,Healthy,MALE,61.0,China,Illumina HiSeq 2000 platform,23.8,
5140,ERR1018309,PRJEB10878,SAMEA3541593,Healthy,Healthy,Healthy,MALE,62.0,China,Illumina HiSeq 2000 platform,21.5,
5141,ERR1018310,PRJEB10878,SAMEA3541594,Non-healthy,CRC,Colorectal Cancer,MALE,68.0,China,Illumina HiSeq 2000 platform,23.7,
5142,ERR1018311,PRJEB10878,SAMEA3541595,Non-healthy,CRC,Colorectal Cancer,MALE,55.0,China,Illumina HiSeq 2000 platform,25.8,


In [42]:
df_otu=pd.read_csv("combined_4168_crc_raw_out.tsv",sep="\t")
df_otu

Unnamed: 0,name,taxonomy_id,taxonomy_lvl,ERR1018185.bracken.out_num,ERR1018185.bracken.out_frac,ERR1018186.bracken.out_num,ERR1018186.bracken.out_frac,ERR1018187.bracken.out_num,ERR1018187.bracken.out_frac,ERR1018188.bracken.out_num,...,SRR8865597.bracken.out_num,SRR8865597.bracken.out_frac,SRR8865598.bracken.out_num,SRR8865598.bracken.out_frac,SRR8865599.bracken.out_num,SRR8865599.bracken.out_frac,SRR8865600.bracken.out_num,SRR8865600.bracken.out_frac,SRR8865601.bracken.out_num,SRR8865601.bracken.out_frac
0,CAG-83 sp900545495,1494,S,216399,0.00460,4410,0.00011,6340,0.00014,2609,...,2925,0.00053,1666,0.00008,26,0.00000,3917,0.00082,51,0.00001
1,CAG-83 sp000435555,3759,S,166900,0.00355,3796,0.00009,9058,0.00021,11143,...,73394,0.01325,3854,0.00019,71,0.00001,9491,0.00198,291,0.00005
2,CAG-83 sp003539495,3631,S,157833,0.00336,2709,0.00007,122592,0.00279,10902,...,1572,0.00028,4464,0.00023,20,0.00000,14572,0.00304,236,0.00004
3,CAG-83 sp000431575,3975,S,133941,0.00285,310679,0.00773,13939,0.00032,555676,...,1219,0.00022,645,0.00003,439,0.00008,773,0.00016,654,0.00011
4,CAG-83 sp900547745,5691,S,95637,0.00203,2756,0.00007,2717,0.00006,2299,...,2320,0.00042,1067,0.00005,32,0.00001,2636,0.00055,58,0.00001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4625,Acinetobacter sp900766635,4500,S,0,0.00000,0,0.00000,0,0.00000,0,...,0,0.00000,0,0.00000,0,0.00000,0,0.00000,0,0.00000
4626,Rs-D84 sp900550495,4778,S,0,0.00000,0,0.00000,0,0.00000,0,...,342,0.00006,0,0.00000,0,0.00000,0,0.00000,0,0.00000
4627,MGYG000004418,5614,S,0,0.00000,0,0.00000,0,0.00000,0,...,0,0.00000,0,0.00000,0,0.00000,0,0.00000,0,0.00000
4628,Rs-D84 sp900550565,4786,S,0,0.00000,0,0.00000,0,0.00000,0,...,97,0.00002,33,0.00000,0,0.00000,0,0.00000,0,0.00000


In [43]:
df_otu = df_otu.drop(columns=[col for col in df_otu.columns if col.endswith(".out_frac")])
df_otu

Unnamed: 0,name,taxonomy_id,taxonomy_lvl,ERR1018185.bracken.out_num,ERR1018186.bracken.out_num,ERR1018187.bracken.out_num,ERR1018188.bracken.out_num,ERR1018189.bracken.out_num,ERR1018190.bracken.out_num,ERR1018191.bracken.out_num,...,SRR8865592.bracken.out_num,SRR8865593.bracken.out_num,SRR8865594.bracken.out_num,SRR8865595.bracken.out_num,SRR8865596.bracken.out_num,SRR8865597.bracken.out_num,SRR8865598.bracken.out_num,SRR8865599.bracken.out_num,SRR8865600.bracken.out_num,SRR8865601.bracken.out_num
0,CAG-83 sp900545495,1494,S,216399,4410,6340,2609,10606,1846,100,...,438,3032,466,18,0,2925,1666,26,3917,51
1,CAG-83 sp000435555,3759,S,166900,3796,9058,11143,11613,2727,1275,...,1744,73094,498,21,0,73394,3854,71,9491,291
2,CAG-83 sp003539495,3631,S,157833,2709,122592,10902,5796,1173,710,...,18562,1743,130,0,0,1572,4464,20,14572,236
3,CAG-83 sp000431575,3975,S,133941,310679,13939,555676,86503,304452,2920,...,1175,2537,45,38,21,1219,645,439,773,654
4,CAG-83 sp900547745,5691,S,95637,2756,2717,2299,12152,1573,244,...,233,1847,331,15,0,2320,1067,32,2636,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4625,Acinetobacter sp900766635,4500,S,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4626,Rs-D84 sp900550495,4778,S,0,0,0,0,0,0,0,...,0,0,0,0,0,342,0,0,0,0
4627,MGYG000004418,5614,S,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4628,Rs-D84 sp900550565,4786,S,0,0,0,0,0,0,0,...,0,0,100,0,0,97,33,0,0,0


In [44]:
df_otu.columns = df_otu.columns.str.replace(".bracken.out_num", "", regex=False)
df_otu

Unnamed: 0,name,taxonomy_id,taxonomy_lvl,ERR1018185,ERR1018186,ERR1018187,ERR1018188,ERR1018189,ERR1018190,ERR1018191,...,SRR8865592,SRR8865593,SRR8865594,SRR8865595,SRR8865596,SRR8865597,SRR8865598,SRR8865599,SRR8865600,SRR8865601
0,CAG-83 sp900545495,1494,S,216399,4410,6340,2609,10606,1846,100,...,438,3032,466,18,0,2925,1666,26,3917,51
1,CAG-83 sp000435555,3759,S,166900,3796,9058,11143,11613,2727,1275,...,1744,73094,498,21,0,73394,3854,71,9491,291
2,CAG-83 sp003539495,3631,S,157833,2709,122592,10902,5796,1173,710,...,18562,1743,130,0,0,1572,4464,20,14572,236
3,CAG-83 sp000431575,3975,S,133941,310679,13939,555676,86503,304452,2920,...,1175,2537,45,38,21,1219,645,439,773,654
4,CAG-83 sp900547745,5691,S,95637,2756,2717,2299,12152,1573,244,...,233,1847,331,15,0,2320,1067,32,2636,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4625,Acinetobacter sp900766635,4500,S,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4626,Rs-D84 sp900550495,4778,S,0,0,0,0,0,0,0,...,0,0,0,0,0,342,0,0,0,0
4627,MGYG000004418,5614,S,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4628,Rs-D84 sp900550565,4786,S,0,0,0,0,0,0,0,...,0,0,100,0,0,97,33,0,0,0


In [46]:
df_out_filtered=df_otu.drop(columns=['taxonomy_id','taxonomy_lvl'])
df_out_filtered

Unnamed: 0,name,ERR1018185,ERR1018186,ERR1018187,ERR1018188,ERR1018189,ERR1018190,ERR1018191,ERR1018192,ERR1018193,...,SRR8865592,SRR8865593,SRR8865594,SRR8865595,SRR8865596,SRR8865597,SRR8865598,SRR8865599,SRR8865600,SRR8865601
0,CAG-83 sp900545495,216399,4410,6340,2609,10606,1846,100,234,1330,...,438,3032,466,18,0,2925,1666,26,3917,51
1,CAG-83 sp000435555,166900,3796,9058,11143,11613,2727,1275,407,1276,...,1744,73094,498,21,0,73394,3854,71,9491,291
2,CAG-83 sp003539495,157833,2709,122592,10902,5796,1173,710,282,30551,...,18562,1743,130,0,0,1572,4464,20,14572,236
3,CAG-83 sp000431575,133941,310679,13939,555676,86503,304452,2920,37209,61607,...,1175,2537,45,38,21,1219,645,439,773,654
4,CAG-83 sp900547745,95637,2756,2717,2299,12152,1573,244,166,762,...,233,1847,331,15,0,2320,1067,32,2636,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4625,Acinetobacter sp900766635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4626,Rs-D84 sp900550495,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,342,0,0,0,0
4627,MGYG000004418,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4628,Rs-D84 sp900550565,0,0,0,0,0,0,0,0,0,...,0,0,100,0,0,97,33,0,0,0


In [47]:
df_out_filtered.to_csv("/lustre/home/babluuniba2022/CRC_VALIDATION_2025/CRC_bracken/braacken_all/final_out_for_model_training.tsv",sep="\t", index=False)
df_out_filtered


Unnamed: 0,name,ERR1018185,ERR1018186,ERR1018187,ERR1018188,ERR1018189,ERR1018190,ERR1018191,ERR1018192,ERR1018193,...,SRR8865592,SRR8865593,SRR8865594,SRR8865595,SRR8865596,SRR8865597,SRR8865598,SRR8865599,SRR8865600,SRR8865601
0,CAG-83 sp900545495,216399,4410,6340,2609,10606,1846,100,234,1330,...,438,3032,466,18,0,2925,1666,26,3917,51
1,CAG-83 sp000435555,166900,3796,9058,11143,11613,2727,1275,407,1276,...,1744,73094,498,21,0,73394,3854,71,9491,291
2,CAG-83 sp003539495,157833,2709,122592,10902,5796,1173,710,282,30551,...,18562,1743,130,0,0,1572,4464,20,14572,236
3,CAG-83 sp000431575,133941,310679,13939,555676,86503,304452,2920,37209,61607,...,1175,2537,45,38,21,1219,645,439,773,654
4,CAG-83 sp900547745,95637,2756,2717,2299,12152,1573,244,166,762,...,233,1847,331,15,0,2320,1067,32,2636,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4625,Acinetobacter sp900766635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4626,Rs-D84 sp900550495,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,342,0,0,0,0
4627,MGYG000004418,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4628,Rs-D84 sp900550565,0,0,0,0,0,0,0,0,0,...,0,0,100,0,0,97,33,0,0,0


In [48]:
# Sum reads for each sample (skip 'name' column)
total_reads_per_sample = df_out_filtered.drop(columns=['name']).sum(axis=0)

# If you want, convert to DataFrame
total_reads_df = total_reads_per_sample.reset_index()
total_reads_df.columns = ['Sample', 'Total_reads']
total_reads_df

Unnamed: 0,Sample,Total_reads
0,ERR1018185,47041531
1,ERR1018186,40213885
2,ERR1018187,43884602
3,ERR1018188,43014049
4,ERR1018189,47641290
...,...,...
4163,SRR8865597,5541014
4164,SRR8865598,19787114
4165,SRR8865599,5772450
4166,SRR8865600,4790513


In [49]:
total_reads_df.describe()

Unnamed: 0,Total_reads
count,4168.0
mean,27269100.0
std,22954510.0
min,30147.0
25%,6988900.0
50%,24309360.0
75%,41983260.0
max,184408000.0


In [53]:
total_reads_df["Total_reads"] = pd.to_numeric(total_reads_df["Total_reads"], errors='coerce')
high_depth_samples = total_reads_df[total_reads_df["Total_reads"] > 1000000]
high_depth_samples

Unnamed: 0,Sample,Total_reads
0,ERR1018185,47041531
1,ERR1018186,40213885
2,ERR1018187,43884602
3,ERR1018188,43014049
4,ERR1018189,47641290
...,...,...
4163,SRR8865597,5541014
4164,SRR8865598,19787114
4165,SRR8865599,5772450
4166,SRR8865600,4790513


In [54]:
4168-3714

454

In [7]:
df["BioProject"].value_counts()

BioProject
PRJNA1237248    853
PRJEB72524      203
PRJNA763023     200
PRJNA429097     191
PRJEB72523      163
PRJNA731589     163
PRJNA447983     140
PRJEB59350      134
PRJEB72525      124
PRJNA1167935     71
PRJEB72526       60
PRJNA531273      30
Name: count, dtype: int64

# For the Prevelance based analysis...

In [4]:
df_out=pd.read_csv("final_out_for_model_training.tsv",sep="\t")
df_out.index=df_out["name"].values
df_out=df_out.drop(columns="name")

df_out


Unnamed: 0,ERR1018185,ERR1018186,ERR1018187,ERR1018188,ERR1018189,ERR1018190,ERR1018191,ERR1018192,ERR1018193,ERR1018194,...,SRR8865592,SRR8865593,SRR8865594,SRR8865595,SRR8865596,SRR8865597,SRR8865598,SRR8865599,SRR8865600,SRR8865601
CAG-83 sp900545495,216399,4410,6340,2609,10606,1846,100,234,1330,407,...,438,3032,466,18,0,2925,1666,26,3917,51
CAG-83 sp000435555,166900,3796,9058,11143,11613,2727,1275,407,1276,477,...,1744,73094,498,21,0,73394,3854,71,9491,291
CAG-83 sp003539495,157833,2709,122592,10902,5796,1173,710,282,30551,152,...,18562,1743,130,0,0,1572,4464,20,14572,236
CAG-83 sp000431575,133941,310679,13939,555676,86503,304452,2920,37209,61607,17855,...,1175,2537,45,38,21,1219,645,439,773,654
CAG-83 sp900547745,95637,2756,2717,2299,12152,1573,244,166,762,181,...,233,1847,331,15,0,2320,1067,32,2636,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Acinetobacter sp900766635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rs-D84 sp900550495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,342,0,0,0,0
MGYG000004418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rs-D84 sp900550565,0,0,0,0,0,0,0,0,0,0,...,0,0,100,0,0,97,33,0,0,0


In [32]:
sample_sums = df_out.sum(axis=0)

# Find samples with total reads < 100k
low_read_samples = sample_sums[sample_sums < 100000].index.tolist()
print(f"Number of low-read samples (<100k): {len(low_read_samples)}")

df_filtered = df_out.drop(columns=low_read_samples)
df_filtered

Number of low-read samples (<100k): 5


Unnamed: 0,ERR1018185,ERR1018186,ERR1018187,ERR1018188,ERR1018189,ERR1018190,ERR1018191,ERR1018192,ERR1018193,ERR1018194,...,SRR8865592,SRR8865593,SRR8865594,SRR8865595,SRR8865596,SRR8865597,SRR8865598,SRR8865599,SRR8865600,SRR8865601
CAG-83 sp900545495,216399,4410,6340,2609,10606,1846,100,234,1330,407,...,438,3032,466,18,0,2925,1666,26,3917,51
CAG-83 sp000435555,166900,3796,9058,11143,11613,2727,1275,407,1276,477,...,1744,73094,498,21,0,73394,3854,71,9491,291
CAG-83 sp003539495,157833,2709,122592,10902,5796,1173,710,282,30551,152,...,18562,1743,130,0,0,1572,4464,20,14572,236
CAG-83 sp000431575,133941,310679,13939,555676,86503,304452,2920,37209,61607,17855,...,1175,2537,45,38,21,1219,645,439,773,654
CAG-83 sp900547745,95637,2756,2717,2299,12152,1573,244,166,762,181,...,233,1847,331,15,0,2320,1067,32,2636,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Acinetobacter sp900766635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rs-D84 sp900550495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,342,0,0,0,0
MGYG000004418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rs-D84 sp900550565,0,0,0,0,0,0,0,0,0,0,...,0,0,100,0,0,97,33,0,0,0


In [33]:
import pandas as pd
import numpy as np

# Calculate prevalence for each species
species_prevalence = (df_filtered > 0).sum(axis=1) / df_filtered.shape[1]
species_prevalence_percent = (species_prevalence * 100).round().astype(int)  # round to nearest integer

# Count species at each exact prevalence percentage
prevalence_counts = species_prevalence_percent.value_counts().reindex(range(0, 101), fill_value=0)

print("Number of species at each exact prevalence percentage (0–100%):")
print(prevalence_counts)


Number of species at each exact prevalence percentage (0–100%):
0        2
1        8
2       10
3       13
4       15
      ... 
96      91
97     101
98     112
99     140
100     83
Name: count, Length: 101, dtype: int64


In [41]:
species_selected = df_filtered[species_prevalence_percent >= 20]
species_selected

Unnamed: 0,ERR1018185,ERR1018186,ERR1018187,ERR1018188,ERR1018189,ERR1018190,ERR1018191,ERR1018192,ERR1018193,ERR1018194,...,SRR8865592,SRR8865593,SRR8865594,SRR8865595,SRR8865596,SRR8865597,SRR8865598,SRR8865599,SRR8865600,SRR8865601
CAG-83 sp900545495,216399,4410,6340,2609,10606,1846,100,234,1330,407,...,438,3032,466,18,0,2925,1666,26,3917,51
CAG-83 sp000435555,166900,3796,9058,11143,11613,2727,1275,407,1276,477,...,1744,73094,498,21,0,73394,3854,71,9491,291
CAG-83 sp003539495,157833,2709,122592,10902,5796,1173,710,282,30551,152,...,18562,1743,130,0,0,1572,4464,20,14572,236
CAG-83 sp000431575,133941,310679,13939,555676,86503,304452,2920,37209,61607,17855,...,1175,2537,45,38,21,1219,645,439,773,654
CAG-83 sp900547745,95637,2756,2717,2299,12152,1573,244,166,762,181,...,233,1847,331,15,0,2320,1067,32,2636,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lentilactobacillus kefiri,0,0,0,0,0,0,0,0,0,0,...,0,0,0,235,167,0,313,116,0,13
Latilactobacillus curvatus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,11,11,0,52,67,0,10
Lentilactobacillus buchneri,0,0,0,0,0,0,0,0,0,0,...,0,0,0,13,28,0,19,11,0,0
Campylobacter_A concisus_R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
import pandas as pd
import numpy as np

# Assuming your dataframe is df_filtered (species x samples)

# Step 1: Add pseudocount to avoid log(0)
df_pseudo = species_selected + 1

# Step 2: Compute geometric mean for each species (row)
# CLR is log(value / geometric mean of row)
geometric_mean = np.exp(np.log(df_pseudo).mean(axis=1))  # row-wise geometric mean

# Step 3: Apply CLR transformation
df_clr = np.log(df_pseudo.div(geometric_mean, axis=0))

print("CLR-transformed table:")
print(df_clr)


CLR-transformed table:
                             ERR1018185  ERR1018186  ERR1018187  ERR1018188  \
CAG-83 sp900545495             5.611436    1.718409    2.081344    1.193658   
CAG-83 sp000435555             4.164486    0.381296    1.250844    1.457986   
CAG-83 sp003539495             5.471456    1.406861    5.218782    2.798950   
CAG-83 sp000431575             3.752683    4.594039    1.490038    5.175463   
CAG-83 sp900547745             5.222157    1.675730    1.661483    1.494496   
...                                 ...         ...         ...         ...   
Lentilactobacillus kefiri     -0.943185   -0.943185   -0.943185   -0.943185   
Latilactobacillus curvatus    -1.617676   -1.617676   -1.617676   -1.617676   
Lentilactobacillus buchneri   -1.079104   -1.079104   -1.079104   -1.079104   
Campylobacter_A concisus_R    -1.058664   -1.058664   -1.058664   -1.058664   
CAG-312 sp900545715           -1.260976   -1.260976   -1.260976   -1.260976   

                            

In [44]:
clr_df=df_clr.T
clr_df

Unnamed: 0,CAG-83 sp900545495,CAG-83 sp000435555,CAG-83 sp003539495,CAG-83 sp000431575,CAG-83 sp900547745,CAG-83 sp900548615,CAG-83 sp900552475,CAG-83 sp001916855,CAG-83 sp900554275,CAG-83 sp900545585,...,Lactobacillus helveticus,CAG-568 sp000434395,Zag1 sp001917115,MGYG000000581,CAG-267 sp001917135,Lentilactobacillus kefiri,Latilactobacillus curvatus,Lentilactobacillus buchneri,Campylobacter_A concisus_R,CAG-312 sp900545715
ERR1018185,5.611436,4.164486,5.471456,3.752683,5.222157,4.121749,4.846377,3.510590,3.392057,3.131909,...,-1.721541,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976
ERR1018186,1.718409,0.381296,1.406861,4.594039,1.675730,1.377180,4.549167,2.147080,2.682404,1.712754,...,-1.721541,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976
ERR1018187,2.081344,1.250844,5.218782,1.490038,1.661483,1.590397,3.113737,4.505178,2.419121,2.109703,...,-1.721541,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976
ERR1018188,1.193658,1.457986,2.798950,5.175463,1.494496,0.938207,1.750027,1.197315,1.331083,1.270434,...,-1.721541,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976
ERR1018189,2.595822,1.499296,2.167253,3.315467,3.159163,2.280459,2.259239,2.128779,2.712366,2.361662,...,-1.721541,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8865597,1.307944,3.342941,0.862897,-0.945873,1.503585,1.064992,0.145094,3.025666,0.866657,0.042280,...,-1.721541,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976
SRR8865598,0.745333,0.396456,1.906181,-1.581680,0.727375,1.164781,-0.290516,-0.700132,1.056285,0.392641,...,7.254089,10.993826,-1.201397,-0.921585,-1.917436,4.806208,2.352616,1.916629,-1.058664,-1.260976
SRR8865599,-3.377611,-3.584004,-3.453321,-1.965705,-2.749661,-1.751980,-2.575881,-3.140303,-2.067150,-2.735763,...,2.107100,-1.158925,-1.201397,-0.921585,-1.917436,3.818989,2.601832,1.405803,-1.058664,-1.260976
SRR8865600,1.599889,1.297534,3.089083,-1.400908,1.631229,2.215175,0.568061,0.022531,1.331083,1.346636,...,2.941898,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976


In [45]:
meta_df=pd.read_csv("meadata_crc.tsv",sep="\t")
meta_df['Health_status'] = meta_df['Health_status'].str.strip()

meta_df['class_label'] = meta_df['Health_status'].apply(lambda x: 1 if x == 'Healthy' else 0)


meta_df

Unnamed: 0,Run_ID,BioProject,BioSample,Health_status,Phenotype,Full_Name,Sex,Age,Location,Platform,BMI,CRC stage,class_label
0,SRR30861073,PRJNA1167935,SAMN44019356,Non-Healthy,CRC,Colorectal Cancer,FEMALE,75.0,Turkey,ILLUMINA,,II,0
1,SRR30861074,PRJNA1167935,SAMN44019355,Non-Healthy,CRC,Colorectal Cancer,MALE,55.0,Turkey,ILLUMINA,,IV,0
2,SRR30861075,PRJNA1167935,SAMN44019379,Healthy,Healthy,Healthy,MALE,64.0,Turkey,ILLUMINA,,,1
3,SRR30861076,PRJNA1167935,SAMN44019377,Healthy,Healthy,Healthy,FEMALE,46.0,Turkey,ILLUMINA,,,1
4,SRR30861077,PRJNA1167935,SAMN44019374,Healthy,Healthy,Healthy,MALE,52.0,Turkey,ILLUMINA,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5139,ERR1018308,PRJEB10878,SAMEA3541592,Healthy,Healthy,Healthy,MALE,61.0,China,Illumina HiSeq 2000 platform,23.8,,1
5140,ERR1018309,PRJEB10878,SAMEA3541593,Healthy,Healthy,Healthy,MALE,62.0,China,Illumina HiSeq 2000 platform,21.5,,1
5141,ERR1018310,PRJEB10878,SAMEA3541594,Non-healthy,CRC,Colorectal Cancer,MALE,68.0,China,Illumina HiSeq 2000 platform,23.7,,0
5142,ERR1018311,PRJEB10878,SAMEA3541595,Non-healthy,CRC,Colorectal Cancer,MALE,55.0,China,Illumina HiSeq 2000 platform,25.8,,0


In [46]:
merge_df=pd.merge(clr_df,meta_df, left_index=True,right_on="Run_ID")
merge_df

Unnamed: 0,CAG-83 sp900545495,CAG-83 sp000435555,CAG-83 sp003539495,CAG-83 sp000431575,CAG-83 sp900547745,CAG-83 sp900548615,CAG-83 sp900552475,CAG-83 sp001916855,CAG-83 sp900554275,CAG-83 sp900545585,...,Health_status,Phenotype,Full_Name,Sex,Age,Location,Platform,BMI,CRC stage,class_label
5016,5.611436,4.164486,5.471456,3.752683,5.222157,4.121749,4.846377,3.510590,3.392057,3.131909,...,Non-healthy,CRC,Colorectal Cancer,MALE,64.0,China,Illumina HiSeq 2000 platform,23.1,,0
5017,1.718409,0.381296,1.406861,4.594039,1.675730,1.377180,4.549167,2.147080,2.682404,1.712754,...,Non-healthy,CRC,Colorectal Cancer,MALE,73.0,China,Illumina HiSeq 2000 platform,23.8,,0
5018,2.081344,1.250844,5.218782,1.490038,1.661483,1.590397,3.113737,4.505178,2.419121,2.109703,...,Non-healthy,CRC,Colorectal Cancer,FEMALE,67.0,China,Illumina HiSeq 2000 platform,24.7,,0
5019,1.193658,1.457986,2.798950,5.175463,1.494496,0.938207,1.750027,1.197315,1.331083,1.270434,...,Non-healthy,CRC,Colorectal Cancer,MALE,56.0,China,Illumina HiSeq 2000 platform,25.6,,0
5020,2.595822,1.499296,2.167253,3.315467,3.159163,2.280459,2.259239,2.128779,2.712366,2.361662,...,Non-healthy,CRC,Colorectal Cancer,FEMALE,59.0,China,Illumina HiSeq 2000 platform,20.7,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,1.307944,3.342941,0.862897,-0.945873,1.503585,1.064992,0.145094,3.025666,0.866657,0.042280,...,Non-Healthy,CRC,Colorectal Cancer,MALE,55.0,India,ILLUMINA,19.1,III,0
630,0.745333,0.396456,1.906181,-1.581680,0.727375,1.164781,-0.290516,-0.700132,1.056285,0.392641,...,Non-Healthy,CRC,Colorectal Cancer,MALE,62.0,India,ILLUMINA,20.1,I,0
631,-3.377611,-3.584004,-3.453321,-1.965705,-2.749661,-1.751980,-2.575881,-3.140303,-2.067150,-2.735763,...,Non-Healthy,CRC,Colorectal Cancer,FEMALE,65.0,India,ILLUMINA,21.0,III,0
628,1.599889,1.297534,3.089083,-1.400908,1.631229,2.215175,0.568061,0.022531,1.331083,1.346636,...,Non-Healthy,CRC,Colorectal Cancer,FEMALE,41.0,India,ILLUMINA,19.2,I,0


In [47]:
ml_df = merge_df.drop(columns=['Run_ID','BioProject','BioSample','Health_status','Phenotype',
                               'Full_Name','Sex','Age','Location','Platform','BMI','CRC stage'])
ml_df

Unnamed: 0,CAG-83 sp900545495,CAG-83 sp000435555,CAG-83 sp003539495,CAG-83 sp000431575,CAG-83 sp900547745,CAG-83 sp900548615,CAG-83 sp900552475,CAG-83 sp001916855,CAG-83 sp900554275,CAG-83 sp900545585,...,CAG-568 sp000434395,Zag1 sp001917115,MGYG000000581,CAG-267 sp001917135,Lentilactobacillus kefiri,Latilactobacillus curvatus,Lentilactobacillus buchneri,Campylobacter_A concisus_R,CAG-312 sp900545715,class_label
5016,5.611436,4.164486,5.471456,3.752683,5.222157,4.121749,4.846377,3.510590,3.392057,3.131909,...,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976,0
5017,1.718409,0.381296,1.406861,4.594039,1.675730,1.377180,4.549167,2.147080,2.682404,1.712754,...,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976,0
5018,2.081344,1.250844,5.218782,1.490038,1.661483,1.590397,3.113737,4.505178,2.419121,2.109703,...,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976,0
5019,1.193658,1.457986,2.798950,5.175463,1.494496,0.938207,1.750027,1.197315,1.331083,1.270434,...,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976,0
5020,2.595822,1.499296,2.167253,3.315467,3.159163,2.280459,2.259239,2.128779,2.712366,2.361662,...,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,1.307944,3.342941,0.862897,-0.945873,1.503585,1.064992,0.145094,3.025666,0.866657,0.042280,...,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976,0
630,0.745333,0.396456,1.906181,-1.581680,0.727375,1.164781,-0.290516,-0.700132,1.056285,0.392641,...,10.993826,-1.201397,-0.921585,-1.917436,4.806208,2.352616,1.916629,-1.058664,-1.260976,0
631,-3.377611,-3.584004,-3.453321,-1.965705,-2.749661,-1.751980,-2.575881,-3.140303,-2.067150,-2.735763,...,-1.158925,-1.201397,-0.921585,-1.917436,3.818989,2.601832,1.405803,-1.058664,-1.260976,0
628,1.599889,1.297534,3.089083,-1.400908,1.631229,2.215175,0.568061,0.022531,1.331083,1.346636,...,-1.158925,-1.201397,-0.921585,-1.917436,-0.943185,-1.617676,-1.079104,-1.058664,-1.260976,0


In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# --- Step 1: Prepare X and y ---
X = ml_df.drop(columns=['class_label']).values  # features
y = ml_df['class_label'].values                 # labels

# --- Step 2: Split into training and test sets (80/20) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Step 3: Train SVM-RBF model without scaling ---
svm_model = SVC(kernel='rbf', C=0.01, random_state=42)
svm_model.fit(X_train, y_train)

# --- Step 4: Evaluate on test set ---
test_score = svm_model.score(X_test, y_test)
print(f"SVM-RBF Test Accuracy: {test_score:.4f}")


SVM-RBF Test Accuracy: 0.6218
