# Data Loading and Setup

## Import Libraries

In [1]:
# Data manipulation
import os
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
import statsmodels.api as sm
from scipy import stats

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix,
    roc_curve,
)

# Settings
# Render matplotlib figures inline in the notebook output (IPython magic).
%matplotlib inline
# Apply seaborn's 'whitegrid' style to subsequent plots.
sns.set_style('whitegrid')

# Add the project's src/ directory to the import path so the local helper
# modules can be imported from this notebook.
import sys
import logging
from pathlib import Path

# Path() is the kernel's current working directory; src/ is looked up one
# level above it.
src_dir = str(Path().resolve().parent / 'src')
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import the project's helper functions from src/data/data_utils.py
from data.data_utils import load_data, setup_logging

## Configuration/Setup

In [None]:
# Location of the raw CSV and the column groups handed to load_data
raw_data_path = '../data/raw/test_project_data.csv'
category_columns = [
    'subject_id',
    'sex',
]
numeric_columns = [
    'age',
    'BMI',
    'prevalent_diabetes',
    'incident_diabetes',
    'diabetes_followup_time',
]
# Regex anchored at the start of a column name: matches the "mtb_" prefix
biomarker_pattern = '^mtb_'

# Initialise logging through the project helper, passing "INFO" as the level
setup_logging("INFO")

## Load Dataset

In [2]:
# NOTE(review): everything in this cell is commented out. It looks like an
# inline copy of the helpers that the first cell imports from data.data_utils
# (setup_logging, load_data) -- confirm the module versions match, then
# consider deleting this cell.
# import re
# import os
# import logging
# import pandas as pd
# import numpy as np
# from pathlib import Path

# # Set up logging in a notebook-friendly way
# def setup_logging(level="INFO"):
#     """Set up logging configuration for Jupyter Notebook."""
#     numeric_level = getattr(logging, level.upper(), None)
#     if not isinstance(numeric_level, int):
#         raise ValueError(f"Invalid log level: {level}")
    
#     logger = logging.getLogger()
#     if not logger.hasHandlers():
#         logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s - %(message)s')
#     else:
#         logger.setLevel(numeric_level)

# # Check if the data file exists
# def check_file_exists(file_path):
#     """Raise an error if the file does not exist."""
#     if not file_path.exists():
#         raise FileNotFoundError(f"Data file not found at {file_path}.")

# # Get data types for columns
# def get_dtypes(category_columns, numeric_columns, biomarker_cols):
#     """
#     Define the data types for each column in the dataset.
    
#     Parameters:
#     category_columns (list): List of categorical column names
#     numeric_columns (list): List of numeric column names
#     biomarker_cols (list): List of biomarker column names

#     Returns:
#     dict: A dictionary mapping column names to data types
#     """
#     dtypes = {col: 'category' for col in category_columns}
#     dtypes.update({col: np.float64 for col in numeric_columns})
#     for col in biomarker_cols:
#         dtypes[col] = np.float64
#     return dtypes

# # Load data with specific column patterns and types
# def load_data(raw_data_path, category_columns, numeric_columns, biomarker_pattern):
#     """
#     Load dataset based on column patterns and types.
    
#     Parameters:
#     raw_data_path (str): Path to the raw data CSV file
#     category_columns (list): List of categorical column names
#     numeric_columns (list): List of numeric column names
#     biomarker_pattern (str): Regex pattern to identify biomarker columns

#     Returns:
#     pd.DataFrame: Loaded dataset
#     """
#     # Check if the file exists
#     check_file_exists(Path(raw_data_path))

#     # Read only the headers (column names) using readline for speed
#     with open(raw_data_path, 'r') as file:
#         first_line = file.readline().strip()

#     # Split the first line by comma (or the appropriate delimiter)
#     column_names = first_line.split(',')

#     # Create a list of biomarker columns based on the pattern
#     biomarker_cols = [col for col in column_names if re.search(biomarker_pattern, col)]

#     # Define data types for the dataset
#     dtypes = get_dtypes(category_columns, numeric_columns, biomarker_cols)

#     # Load the full dataset with the specified data types
#     raw_df = pd.read_csv(raw_data_path, dtype=dtypes, engine='pyarrow')

#     logging.info(f"Dataset shape: {raw_df.shape}")

#     return raw_df

In [5]:
# Read the raw CSV through the project loader, using the path and column
# settings from the configuration cell
raw_df = load_data(
    raw_data_path,
    category_columns,
    numeric_columns,
    biomarker_pattern,
)

# Log a marker line, then let the notebook render the leading rows
logging.info("Dataset preview:")
raw_df.head()

2024-09-17 23:08:56,484 - INFO - Dataset shape: (8291, 10007)
2024-09-17 23:08:56,500 - INFO - Dataset preview:


Unnamed: 0,subject_id,mtb_0018261,mtb_0018266,mtb_0018325,mtb_0018326,mtb_0018327,mtb_0018351,mtb_0018362,mtb_0018470,mtb_0018509,...,mtb_2127305,mtb_2129124,mtb_2129210,mtb_2129554,BMI,age,sex,prevalent_diabetes,incident_diabetes,diabetes_followup_time
0,sbj_0000,5885.011,15177.95,19653.31,12611.67,9042.063,,60124.56,9552.881,,...,,,22112.25,,18.664268,33.81,male,0.0,0.0,14.77
1,sbj_0001,7624.425,10901.94,12129.36,9336.46,21680.93,,83700.8,15068.12,,...,,,,,28.175977,68.56,male,0.0,0.0,14.77
2,sbj_0002,7220.4,18086.69,23707.55,7344.697,26314.65,,82482.38,13911.3,,...,15185.79,,19425.35,,22.971959,55.68,male,0.0,0.0,14.77
3,sbj_0003,6797.486,12364.85,17775.76,25836.15,30563.14,,92839.12,11690.13,,...,,,,,21.96037,43.89,female,0.0,0.0,14.78
4,sbj_0004,,19046.33,18701.06,14137.27,27020.92,,85453.69,9579.896,,...,5318.547,,,,40.454949,47.76,male,0.0,0.0,14.86


# Data Preprocessing

## Data Overview

In [4]:
# Fraction of missing values per column, highest first, as a two-column table
null_fraction = raw_df.isna().mean()
missingness_rate = (
    null_fraction
    .sort_values(ascending=False)
    .rename_axis('column')
    .reset_index(name='missing_rate')
)

# Keep only the columns where more than 30% of the values are missing
exceeds_threshold = missingness_rate['missing_rate'].gt(0.30)
filtered_missingness = missingness_rate.loc[exceeds_threshold]

# Last expression of the cell: rendered as the cell output
filtered_missingness

Unnamed: 0,column,missing_rate
0,mtb_0142579,0.994452
1,mtb_1385957,0.994211
2,mtb_1422699,0.992763
3,mtb_1068441,0.992763
4,mtb_0889532,0.992643
...,...,...
6377,mtb_0634601,0.301049
6378,mtb_1807948,0.300808
6379,mtb_1855856,0.300687
6380,mtb_1561131,0.300326


In [5]:
# missing_corr = raw_df.isnull().corr()
# sns.heatmap(missing_corr, cmap="coolwarm", annot=True)

In [None]:
import missingno as msno

# raw_df is very wide (the load cell logged a shape of (8291, 10007)), so
# drawing every column produces an unreadable figure and makes the pairwise
# nullity heatmap extremely expensive. Plot a reproducible random subset of
# columns instead; raise MSNO_MAX_COLUMNS to inspect more of them.
MSNO_MAX_COLUMNS = 40
msno_subset = raw_df.sample(
    n=min(MSNO_MAX_COLUMNS, raw_df.shape[1]),
    axis=1,
    random_state=0,  # fixed seed so the same columns are drawn on every run
)

# Nullity matrix: one row per record, one strip per sampled column
msno.matrix(msno_subset)
# Nullity correlation between the sampled columns; the trailing semicolon
# keeps the Axes repr out of the cell output
msno.heatmap(msno_subset);

## Handle Missing Values

## Encode Categorical Variables

In [None]:
# Encode sex as 1 (male) / 0 (female).
SEX_ENCODING = {'male': 1, 'female': 0}

# Series.map turns every value that is not a key of the mapping into NaN, so
# mapping an already-encoded column (e.g. when this cell is run a second time)
# would wipe it out. Only apply the mapping while the column still holds the
# raw labels; the first run behaves exactly like a plain .map call.
sex_values = raw_df['sex'].astype(object).dropna()
sex_already_encoded = sex_values.isin(list(SEX_ENCODING.values())).all()
if not sex_already_encoded:
    raw_df['sex'] = raw_df['sex'].map(SEX_ENCODING)

## Identify Biomarker Columns


## Handle Biomarker Missing Values

## Transform Biomarkers

# Exploratory Data Analysis (EDA)

## Demographic Distributions

In [None]:
# Age Distribution

In [None]:
# BMI Distribution

In [None]:
# Sex Distribution

## Outcome Variable Analysis

# Statistical Analysis

## Univariate Logistic Regression for Each Biomarker

## Multiple Testing Correction

## Identify Significant Biomarkers

# Visualization of Significant Biomarkers

## Plot Top Significant Biomarkers

# Machine Learning Model

## Feature Selection

## Data Splitting

## Model Training

## Model Evaluation

### ROC AUC Score

### Classification Report

### Confusion Matrix

## ROC Curve

# Discussion

## Interpretation of Significant Biomarkers

## Model Performance

## Limitations

# Conclusion

# References