# Data Loading

## Import Libraries

In [2]:
# Data manipulation
import os
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
import statsmodels.api as sm
from scipy import stats

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix,
    roc_curve,
)

# Settings
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply seaborn's 'whitegrid' style to subsequent plots
sns.set_style('whitegrid')

## Load Dataset

In [4]:
# Location of the raw dataset. The path is relative, so it is resolved against
# the current working directory of the kernel.
raw_data_fname = 'test_project_data.csv'
raw_data_dir   = 'data/raw'
raw_data_path = f'../{raw_data_dir}/{raw_data_fname}'

# Fail fast with an actionable message if the dataset is missing.
# isfile() (rather than exists()) also rejects a directory sitting at that
# path, which would otherwise pass this check and only fail later when the
# file is opened.
if not os.path.isfile(raw_data_path):
    raise FileNotFoundError(
        f"Data file not found at {raw_data_path}. Please place '{raw_data_fname}' in the '{raw_data_dir}/' directory."
    )

In [17]:
# Read only the header row to discover the column names.
# Letting pandas parse it (instead of splitting the raw first line on commas)
# handles quoted column names and decodes the file as UTF-8 regardless of the
# platform default. The unnamed leading column is reported as 'Unnamed: 0',
# the same name it gets in raw_df below.
column_names = pd.read_csv(raw_data_path, nrows=0).columns.tolist()

# Biomarker columns are identified by their 'mtb_' name prefix
biomarker_cols = [col for col in column_names if col.startswith('mtb_')]

# Explicit dtypes for the non-biomarker columns.
# The diabetes indicators are loaded as float64 (not an integer type) so that
# missing values can be represented as NaN.
dtypes = {
    'subject_id': 'category',
    'sex': 'category',
    'prevalent_diabetes': np.float64,
    'incident_diabetes': np.float64,
    'age': np.float64,
    'BMI': np.float64,
    'diabetes_followup_time': np.float64
}

# Every biomarker column is loaded as float64
dtypes.update({col: np.float64 for col in biomarker_cols})

# Load the full dataset with the specified data types.
# NOTE(review): the file's first column has no header and looks like a saved
# row index (it loads as 'Unnamed: 0'); it is kept here so the shape of raw_df
# is unchanged — consider dropping it during preprocessing.
raw_df = pd.read_csv(raw_data_path, dtype=dtypes, engine='pyarrow')

# Display basic information
print(f"Dataset shape: {raw_df.shape}")
raw_df.head()

Dataset shape: (8291, 10007)


Unnamed: 0,subject_id,mtb_0018261,mtb_0018266,mtb_0018325,mtb_0018326,mtb_0018327,mtb_0018351,mtb_0018362,mtb_0018470,mtb_0018509,...,mtb_2127305,mtb_2129124,mtb_2129210,mtb_2129554,BMI,age,sex,prevalent_diabetes,incident_diabetes,diabetes_followup_time
0,sbj_0000,5885.011,15177.95,19653.31,12611.67,9042.063,,60124.56,9552.881,,...,,,22112.25,,18.664268,33.81,male,0.0,0.0,14.77
1,sbj_0001,7624.425,10901.94,12129.36,9336.46,21680.93,,83700.8,15068.12,,...,,,,,28.175977,68.56,male,0.0,0.0,14.77
2,sbj_0002,7220.4,18086.69,23707.55,7344.697,26314.65,,82482.38,13911.3,,...,15185.79,,19425.35,,22.971959,55.68,male,0.0,0.0,14.77
3,sbj_0003,6797.486,12364.85,17775.76,25836.15,30563.14,,92839.12,11690.13,,...,,,,,21.96037,43.89,female,0.0,0.0,14.78
4,sbj_0004,,19046.33,18701.06,14137.27,27020.92,,85453.69,9579.896,,...,5318.547,,,,40.454949,47.76,male,0.0,0.0,14.86


# Data Preprocessing

## Data Overview

## Handle Missing Values

## Encode Categorical Variables

## Identify Biomarker Columns


## Handle Biomarker Missing Values

## Transform Biomarkers

# Exploratory Data Analysis (EDA)

## Demographic Distributions

In [None]:
# Age Distribution

In [None]:
# BMI Distribution

In [None]:
# Sex Distribution

## Outcome Variable Analysis

# Statistical Analysis

## Univariate Logistic Regression for Each Biomarker

## Multiple Testing Correction

## Identify Significant Biomarkers

# Visualization of Significant Biomarkers

## Plot Top Significant Biomarkers

# Machine Learning Model

## Feature Selection

## Data Splitting

## Model Training

## Model Evaluation

### ROC AUC Score

### Classification Report

### Confusion Matrix

## ROC Curve

# Discussion

## Interpretation of Significant Biomarkers

## Model Performance

## Limitations

# Conclusion

# References