#### <b>Import libraries</b>

In [1]:
import pandas as pd
import os
import numpy as np

#### <b>Merge, cleanup and transformation for bio.csv and gut_health_test.csv</b>

In [3]:
# Read the bio and gut_health_test CSV files.
# The data folder can be overridden with the CGMACROS_DATA_DIR environment variable so the
# notebook also runs on machines without the original local drive layout. When the variable
# is not set, the original location below is used unchanged.
main_folder = os.environ.get('CGMACROS_DATA_DIR', r'C:/Data_BVS/DATASETS/CGMacros/')

bio_df = pd.read_csv(os.path.join(main_folder, 'bio.csv'))
guthealth_df = pd.read_csv(os.path.join(main_folder, 'gut_health_test.csv'))

#### <b>Merge</b>

In [4]:
# Inner join on 'subject': only subjects present in both tables are kept
merged_bio_gutHealth_df = bio_df.merge(guthealth_df, on='subject', how='inner')

In [5]:
# Clean column names: trim leading/trailing whitespace from every header
merged_bio_gutHealth_df = merged_bio_gutHealth_df.rename(columns=str.strip)

#### <b>Rename columns for better understanding</b>

In [6]:
# Raw CSV header -> analysis-friendly column name.
# Headers that are absent from the frame are silently ignored by DataFrame.rename.
column_renames = {
    # Participant identity and body measurements
    'subject': 'ParticipantID',
    'Body weight': 'Weight',
    'Self-identify': 'Ethnicity',

    # Glucose / lipid measurements
    'A1c PDL (Lab)': 'HbA1c',
    'Fasting GLU - PDL (Lab)': 'Fasting_Glucose',
    'Non HDL': 'Non_HDL',
    'LDL (Cal)': 'LDL',
    'VLDL (Cal)': 'VLDL',
    'Cho/HDL Ratio': 'Cho/HDL_Ratio',
    'Collection time PDL (Lab)': 'Fasting_Lab_Collection_Time',

    # Fingerstick readings and their times.
    # NOTE(review): the '.1' / '.2' suffixes presumably come from pandas de-duplicating
    # repeated 'Time (t)' headers when reading the CSV - confirm against bio.csv.
    '#1 Contour Fingerstick GLU': 'Fingerstick_Glucose_1',
    'Time (t)': 'Fingerstick_Glucose_1_Time',
    '#2 Contour Fingerstick GLU': 'Fingerstick_Glucose_2',
    'Time (t).1': 'Fingerstick_Glucose_2_Time',
    '#3 Contour Fingerstick GLU': 'Fingerstick_Glucose_3',
    'Time (t).2': 'Fingerstick_Glucose_3_Time',

    # Gut health scores
    'Gut Lining Health': 'Gut_Lining_Health',
    'LPS Biosynthesis Pathways': 'LPS_Biosynthesis_Pathways',
    'Biofilm, Chemotaxis, and Virulence Pathways': 'Biofilm_Chemotaxis_and_Virulence_Pathways',
    'TMA Production Pathways': 'TMA_Production_Pathways',
    'Ammonia Production Pathways': 'Ammonia_Production_Pathways',
    'Metabolic Fitness': 'Metabolic_Fitness',
    'Active Microbial Diversity': 'Active_Microbial_Diversity',
    'Butyrate Production Pathways': 'Butyrate_Production_Pathways',
    'Flagellar Assembly Pathways': 'Flagellar_Assembly_Pathways',
    'Putrescine Production Pathways': 'Putrescine_Production_Pathways',
    'Uric Acid Production Pathways': 'Uric_Acid_Production_Pathways',
    'Bile Acid Metabolism Pathways': 'Bile_Acid_Metabolism_Pathways',
    'Inflammatory Activity': 'Inflammatory_Activity',
    'Gut Microbiome Health': 'Gut_Microbiome_Health',
    'Digestive Efficiency': 'Digestive_Efficiency',
    'Protein Fermentation': 'Protein_Fermentation',
    'Gas Production': 'Gas_Production',
    'Methane Gas Production Pathways': 'Methane_Gas_Production_Pathways',
    'Sulfide Gas Production Pathways': 'Sulfide_Gas_Production_Pathways',
    'Oxalate Metabolism Pathways': 'Oxalate_Metabolism_Pathways',
    'Salt Stress Pathways': 'Salt_Stress_Pathways',
    'Microbiome-Induced Stress': 'Microbiome_Induced_Stress',
}

merged_bio_gutHealth_df = merged_bio_gutHealth_df.rename(columns=column_renames)


#### <b>Column Transformations:</b>

In [7]:
# BMI - round to two decimal places
merged_bio_gutHealth_df['BMI'] = merged_bio_gutHealth_df['BMI'].round(2)

# Categorize BMI: Underweight (< 18.5), Normal (18.5 - 24.9), Overweight (25.0 - 29.9), Obese (>= 30.0)
# The bin edges are the LOWER bound of each class and the intervals are left-closed
# ([low, high), via right=False). This matters because BMI keeps two decimals:
# with edges 24.9 / 29.9 and right-closed bins, 24.95 would be labelled Overweight and
# 29.95 Obese, and exactly 18.5 would be labelled Underweight.
bmi_bins = [0, 18.5, 25.0, 30.0, float('inf')]
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
# Apply categorization (missing BMI values stay missing)
merged_bio_gutHealth_df['BMI_Classification'] = pd.cut(merged_bio_gutHealth_df['BMI'],
                                                        bins=bmi_bins,
                                                        labels=bmi_labels,
                                                        right=False)


# Categorize the HbA1c levels: Normal (< 5.7), Prediabetes (5.7 - 6.4), Diabetes (>= 6.5)
# Edges are the lower bound of each class with left-closed intervals (right=False), so
# values between the printed ranges (e.g. 5.65 or 6.45) stay in the lower class instead of
# being pushed up by a 5.6 / 6.4 upper edge. One-decimal values are classified as before.
hba1c_bins = [0, 5.7, 6.5, float('inf')]
hba1c_labels = ['Normal', 'Prediabetes', 'Diabetes']
# Apply categorization (missing HbA1c values stay missing)
merged_bio_gutHealth_df['HbA1c_Classification'] = pd.cut(merged_bio_gutHealth_df['HbA1c'],
                                                  bins=hba1c_bins,
                                                  labels=hba1c_labels,
                                                  right=False)


# Categorize the fasting glucose levels (mg/dL): Normal (< 100), Prediabetes (100 - 125), Diabetes (>= 126)
# Edges are the lower bound of each class with left-closed intervals (right=False), so
# non-integer readings such as 99.5 or 125.5 stay in the lower class instead of being pushed
# up by a 99 / 125 upper edge. Whole-number readings are classified as before.
glucose_bins = [0, 100, 126, float('inf')]
glucose_labels = ['Normal', 'Prediabetes', 'Diabetes']
# Apply categorization (missing glucose values stay missing)
merged_bio_gutHealth_df['Fasting_Glucose_Classification'] = pd.cut(merged_bio_gutHealth_df['Fasting_Glucose'],
                                                                    bins=glucose_bins,
                                                                    labels=glucose_labels,
                                                                    right=False)



# Fasting lab collection time: reformat from HH:MM:SS AM/PM to 24-hour HH:MM text.
# errors='coerce' turns values that cannot be parsed into missing values instead of raising.
collection_time_parsed = pd.to_datetime(merged_bio_gutHealth_df['Fasting_Lab_Collection_Time'],
                                        format='mixed',
                                        errors='coerce')
merged_bio_gutHealth_df['Fasting_Lab_Collection_Time'] = collection_time_parsed.dt.strftime('%H:%M')

# Derived lipid columns, rounded to whole numbers and stored as nullable Int64:
#   VLDL = Triglycerides / 5
#   LDL  = Cholesterol - HDL - (Triglycerides / 5)
vldl_estimate = merged_bio_gutHealth_df['Triglycerides'] / 5
ldl_estimate = merged_bio_gutHealth_df['Cholesterol'] - merged_bio_gutHealth_df['HDL'] - vldl_estimate

merged_bio_gutHealth_df['LDL_Calculated'] = ldl_estimate.round().astype('Int64')
merged_bio_gutHealth_df['VLDL_Calculated'] = vldl_estimate.round().astype('Int64')

# ParticipantID = 12 has erroneous LDL/VLDL inputs as per the data dictionary, so the
# calculated values for that participant are replaced by the cohort median.
erroneous_rows = merged_bio_gutHealth_df['ParticipantID'] == 12

# Compute the medians from the OTHER participants only, so the erroneous values being
# replaced cannot influence the statistic used to replace them.
ldl_median = merged_bio_gutHealth_df.loc[~erroneous_rows, 'LDL_Calculated'].median().round()
vldl_median = merged_bio_gutHealth_df.loc[~erroneous_rows, 'VLDL_Calculated'].median().round()

# Impute with these median values for ParticipantID = 12
merged_bio_gutHealth_df.loc[erroneous_rows, 'LDL_Calculated'] = ldl_median
merged_bio_gutHealth_df.loc[erroneous_rows, 'VLDL_Calculated'] = vldl_median


# Add new column: Cho/HDL Ratio = Cholesterol / HDL, rounded to one decimal place
cho_hdl_ratio = merged_bio_gutHealth_df['Cholesterol'] / merged_bio_gutHealth_df['HDL']
merged_bio_gutHealth_df['Cho/HDL_Ratio_Calculated'] = cho_hdl_ratio.round(1)

# Add a text description column ("<score column>_Desc") next to every gut health score.
# Scores 1/2/3 are translated; any other value is left as-is by Series.replace.
gut_health_score_mapping = {1: 'Not Optimal', 2: 'Average', 3: 'Good'}

gut_health_cols_to_map = [
    'Gut_Lining_Health',
    'LPS_Biosynthesis_Pathways',
    'Biofilm_Chemotaxis_and_Virulence_Pathways',
    'TMA_Production_Pathways',
    'Ammonia_Production_Pathways',
    'Metabolic_Fitness',
    'Active_Microbial_Diversity',
    'Butyrate_Production_Pathways',
    'Flagellar_Assembly_Pathways',
    'Putrescine_Production_Pathways',
    'Uric_Acid_Production_Pathways',
    'Bile_Acid_Metabolism_Pathways',
    'Inflammatory_Activity',
    'Gut_Microbiome_Health',
    'Digestive_Efficiency',
    'Protein_Fermentation',
    'Gas_Production',
    'Methane_Gas_Production_Pathways',
    'Sulfide_Gas_Production_Pathways',
    'Oxalate_Metabolism_Pathways',
    'Salt_Stress_Pathways',
    'Microbiome_Induced_Stress',
]

# Build all description columns first, then attach them in one step (same column order)
gut_health_desc_columns = {
    f'{score_col}_Desc': merged_bio_gutHealth_df[score_col].replace(gut_health_score_mapping)
    for score_col in gut_health_cols_to_map
}
merged_bio_gutHealth_df = merged_bio_gutHealth_df.assign(**gut_health_desc_columns)


#### <b>Cleanup and transformation for microbes.csv</b>

In [8]:
# We determined the health impact category (Good, Moderate, Bad) for the microbes in the given microbes.csv
# Created a custom csv with microbe name and health impact category - Microbes_Categorized_as_Good_Moderate_Bad.csv
# Then we calculated the count of each of these categories per participant, and merged with the bio and gut_health data.

In [14]:
import pandas as pd

# Load the microbes table and the custom microbe -> health impact category table
microbes_path = os.path.join(main_folder, 'microbes.csv')
health_impact_path = os.path.join(main_folder, 'Microbes_Categorized_as_Good_Moderate_Bad.csv')

microbes_df = pd.read_csv(microbes_path)
health_impact_df = pd.read_csv(health_impact_path)


# Trim whitespace from the headers of both tables, and give the microbes table the same
# participant key name ('ParticipantID') that the bio / gut health data uses
microbes_df = microbes_df.rename(columns=str.strip).rename(columns={"subject": "ParticipantID"})
health_impact_df = health_impact_df.rename(columns=str.strip)

# Every column other than ParticipantID is treated as a microbe column
is_microbe_column = microbes_df.columns != 'ParticipantID'
microbe_columns = microbes_df.columns[is_microbe_column].tolist()


# Lookup table: microbe name -> health impact category (both whitespace-trimmed)
microbe_names = health_impact_df['Microbe'].str.strip()
impact_categories = health_impact_df['Health_Impact_Category'].str.strip()
microbe_to_impact = {name: category for name, category in zip(microbe_names, impact_categories)}

# Split the microbe columns by category in a single pass, keeping the original column order.
# Columns with no category, or a category other than Good/Moderate/Bad, are left out.
microbes_by_impact = {'Good': [], 'Moderate': [], 'Bad': []}
for microbe_name in microbe_columns:
    impact = microbe_to_impact.get(microbe_name)
    if impact in microbes_by_impact:
        microbes_by_impact[impact].append(microbe_name)

good_microbes = microbes_by_impact['Good']
moderate_microbes = microbes_by_impact['Moderate']
bad_microbes = microbes_by_impact['Bad']

# Per participant, count how many microbe columns of each category have the value 1
participant_microbes_health_impact_df = pd.DataFrame({
    'ParticipantID': microbes_df['ParticipantID'],
    'Good_Microbes_Count': microbes_df[good_microbes].eq(1).sum(axis=1),
    'Moderate_Microbes_Count': microbes_df[moderate_microbes].eq(1).sum(axis=1),
    'Bad_Microbes_Count': microbes_df[bad_microbes].eq(1).sum(axis=1),
})

participant_microbes_health_impact_df

Unnamed: 0,ParticipantID,Good_Microbes_Count,Moderate_Microbes_Count,Bad_Microbes_Count
0,1,87,435,15
1,2,42,377,30
2,3,62,409,27
3,4,53,328,26
4,5,23,292,27
5,6,33,305,17
6,7,50,323,23
7,8,38,439,24
8,9,44,286,10
9,10,32,248,6


#### <b>Merge the cleaned/transformed microbes data with cleaned/transformed bio and gut health data.</b>

In [15]:
# Inner join on ParticipantID: only participants present in both frames are kept
merged_bio_gutHealth_microbes_df = merged_bio_gutHealth_df.merge(
    participant_microbes_health_impact_df,
    on='ParticipantID',
    how='inner',
)

#### <b>Get the merged, cleaned and transformed data for bio.csv, gut_health_test.csv and microbes.csv</b>

In [16]:
# Column selection and order for the exported CSV
column_order = [
    # Participant and body measurements
    'ParticipantID', 'Age', 'Gender',
    'BMI', 'BMI_Classification', 'Weight', 'Height', 'Ethnicity',

    # Glucose and insulin
    'HbA1c', 'HbA1c_Classification',
    'Fasting_Glucose', 'Fasting_Glucose_Classification',
    'Insulin',

    # Lipids (original columns next to their recalculated counterparts)
    'Triglycerides', 'Cholesterol', 'HDL', 'Non_HDL',
    'LDL', 'LDL_Calculated',
    'VLDL', 'VLDL_Calculated',
    'Cho/HDL_Ratio', 'Cho/HDL_Ratio_Calculated',

    # Collection time and fingerstick readings
    'Fasting_Lab_Collection_Time',
    'Fingerstick_Glucose_1', 'Fingerstick_Glucose_1_Time',
    'Fingerstick_Glucose_2', 'Fingerstick_Glucose_2_Time',
    'Fingerstick_Glucose_3', 'Fingerstick_Glucose_3_Time',

    # Gut health scores, each followed by its text description
    'Gut_Lining_Health', 'Gut_Lining_Health_Desc',
    'LPS_Biosynthesis_Pathways', 'LPS_Biosynthesis_Pathways_Desc',
    'Biofilm_Chemotaxis_and_Virulence_Pathways', 'Biofilm_Chemotaxis_and_Virulence_Pathways_Desc',
    'TMA_Production_Pathways', 'TMA_Production_Pathways_Desc',
    'Ammonia_Production_Pathways', 'Ammonia_Production_Pathways_Desc',
    'Metabolic_Fitness', 'Metabolic_Fitness_Desc',
    'Active_Microbial_Diversity', 'Active_Microbial_Diversity_Desc',
    'Butyrate_Production_Pathways', 'Butyrate_Production_Pathways_Desc',
    'Flagellar_Assembly_Pathways', 'Flagellar_Assembly_Pathways_Desc',
    'Putrescine_Production_Pathways', 'Putrescine_Production_Pathways_Desc',
    'Uric_Acid_Production_Pathways', 'Uric_Acid_Production_Pathways_Desc',
    'Bile_Acid_Metabolism_Pathways', 'Bile_Acid_Metabolism_Pathways_Desc',
    'Inflammatory_Activity', 'Inflammatory_Activity_Desc',
    'Gut_Microbiome_Health', 'Gut_Microbiome_Health_Desc',
    'Digestive_Efficiency', 'Digestive_Efficiency_Desc',
    'Protein_Fermentation', 'Protein_Fermentation_Desc',
    'Gas_Production', 'Gas_Production_Desc',
    'Methane_Gas_Production_Pathways', 'Methane_Gas_Production_Pathways_Desc',
    'Sulfide_Gas_Production_Pathways', 'Sulfide_Gas_Production_Pathways_Desc',
    'Oxalate_Metabolism_Pathways', 'Oxalate_Metabolism_Pathways_Desc',
    'Salt_Stress_Pathways', 'Salt_Stress_Pathways_Desc',
    'Microbiome_Induced_Stress', 'Microbiome_Induced_Stress_Desc',

    # Per-participant microbe counts by health impact category
    'Good_Microbes_Count', 'Moderate_Microbes_Count', 'Bad_Microbes_Count',
]

# Save the final results to CSV (written to the current working directory).
# Only the columns listed in column_order are exported, in that order, without the index.
# The same output_file variable is used for writing and for the message, so the reported
# path always matches the file that was actually written.
output_file = '02DataExplorers_Cleaned_MergedBioGuthealthMicrobesData.csv'
merged_bio_gutHealth_microbes_df.to_csv(output_file, columns=column_order, index=False)
print(f"Merged/Cleaned/Transformed bio, guthealth, microbes data saved to {output_file}")

Merged/Cleaned/Transformed bio, guthealth, microbes data saved to 02DataExplorers_Cleaned_MergedBioGuthealthMicrobesData.csv
