In [1]:
# Switch the kernel's working directory; later cells open files by relative name.
# NOTE(review): this is an absolute, machine-specific path - adjust it before running elsewhere.
%cd "C:\Users\Gwoncheol\Desktop\"

C:\Users\Gwoncheol\Desktop


In [17]:
import pandas as pd

# Read the study table (path is relative to the current working directory)
# and show its first row as the cell output.
df = pd.read_csv(filepath_or_buffer='data-diabetes.csv')
df.head(n=1)

Unnamed: 0,ID,Group,Age,Sex,BMI,SBP,DBP,FBS,HbA1C,Cholesterol,...,Libanicoccus,f_Prevotellaceae_g_uncultured,Gastranaerophilales,Parabacteroides,Family_XIII_AD3011_group,f_Oscillospiraceae_g_UCG_005,Alistipes,f_Ruminococcaceae_g_uncultured,o_Oscillospirales_g_UCG_010,Monoglobus
0,M_004,Known_Diabetics,63,M,29.366417,143,90,139,8.1,256.0,...,0.0,0.0,0.015701,0.0,0.000714,0.0,0.000357,0.0,0.000306,0.002906


In [19]:
import statsmodels.formula.api as smf

# Convert 'Group' to a categorical column with an explicit, fixed list of levels.
# (The reference level itself is chosen in the formula below via Treatment(reference=...).)
df['Group'] = pd.Categorical(
    df['Group'],
    categories=['Control', 'Pre_Diabetics', 'Known_Diabetics', 'Unknown_Diabetics'],
)

# Columns that are identifiers or predictors and therefore never modelled as an outcome
non_outcome_cols = ['ID', 'Group', 'Age', 'Sex', 'BMI']

# Right-hand side shared by every model: group effect (vs. 'Control') plus adjustment terms
base_predictors = 'C(Group, Treatment(reference="Control")) + Age + BMI + Sex'

# 2. Every remaining column, in its original order, is treated as an outcome
outcome_cols = df.columns[~df.columns.isin(non_outcome_cols)].tolist()

# Per-outcome result tables, keyed by outcome column name
all_results = dict()

# Helper function logic to extract key results as a DataFrame
def extract_results(model, outcome_name):
    """Return the coefficient and p-value of every term of a fitted model.

    Parameters
    ----------
    model
        Fitted regression results object exposing ``params`` and ``pvalues``
        as pandas Series indexed by term name (e.g. the object returned by
        ``smf.ols(...).fit()``).
    outcome_name : str
        Name of the modelled outcome; used as the prefix of both column names.

    Returns
    -------
    pandas.DataFrame
        Indexed by term name, with the two columns
        ``'<outcome_name> Coef'`` and ``'<outcome_name> P>|t|'``.
    """
    # Take the estimates straight from the results object instead of rendering
    # the summary to HTML and parsing it back: the summary is formatted for
    # display (values are rounded), and the round-trip through pd.read_html
    # needs an optional HTML parser and a literal-HTML-string input.
    df_results = pd.DataFrame({
        f'{outcome_name} Coef': model.params,
        f'{outcome_name} P>|t|': model.pvalues,
    })

    return df_results

# 3. Fit one OLS model per outcome column, always with the same predictors
print("Starting regression analysis for outcomes:", outcome_cols)

for outcome in outcome_cols:
    # Build the formula for this outcome, e.g. "<outcome> ~ <base predictors>"
    formula = f'{outcome} ~ {base_predictors}'

    # Fit the OLS model
    model = smf.ols(formula, data=df).fit()

    # Keep only the coefficient / p-value table for this outcome
    results_df = extract_results(model, outcome)
    all_results[outcome] = results_df

# 4. Consolidate results: one row per predictor term, two columns per outcome
if not all_results:
    # Without this guard the failure would surface later as an opaque error
    raise ValueError("No outcome columns were modelled; nothing to summarise.")

# A single column-wise concat (outer join on the term index) replaces the
# repeated pairwise outer merges, which re-copied the growing frame each time.
# sort_index() keeps the lexicographic row order that an outer merge produces.
final_summary_df = pd.concat(list(all_results.values()), axis=1, join='outer').sort_index()

# Rename the index for clarity
final_summary_df.index.name = 'Predictor'

# 5. Save to CSV (written relative to the current working directory)
output_filename = 'regression_summary_auto_detected.csv'
final_summary_df.to_csv(output_filename)

print(f"\nAnalysis complete. Results saved to {output_filename}")
print(f"Summary DataFrame created. First 5 rows:\n{final_summary_df.head()}")

Starting regression analysis for outcomes: ['SBP', 'DBP', 'FBS', 'HbA1C', 'Cholesterol', 'Triglyceride', 'Serum_Creatinine', 'Insulin', 'Height', 'Weight', 'Waist', 'Hip', 'Waist_to_Hip_Ratio', 'QUICKI', 'HOMA_IR', 'Lactobacillus', 'Bifidobacterium', 'Collinsella', 'Dorea', 'Faecalibacterium', 'Prevotella', 'Agathobacter', 'Blautia', 'Catenibacterium', 'Streptococcus', '_Eubacterium__hallii_group', 'Dialister', 'Holdemanella', 'Subdoligranulum', 'Ruminococcus', '_Eubacterium__coprostanoligenes_group', 'Coprococcus', 'Olsenella', 'Roseburia', '_Ruminococcus__torques_group', 'RF39', 'Fusicatenibacter', '_Ruminococcus__gauvreauii_group', 'Clostridium_sensu_stricto_1', 'Megasphaera', 'Bacteroides', 'Escherichia_Shigella', 'Romboutsia', 'Alloprevotella', 'Clostridia_UCG_014', 'Christensenellaceae_R_7_group', 'Senegalimassilia', 'Rikenellaceae_RC9_gut_group', 'Muribaculaceae', 'Lachnospiraceae_NK4A136_group', 'Butyricicoccus', 'Mitsuokella', 'f_Oscillospiraceae_g_UCG_002', 'f_Eggerthellaceae