#### The data_download.py script downloads NHANES datasets for demographics and body measures, then converts them to CSV files. We then use these datasets in the Jupyter notebook to train a simple model and track experiments with MLflow.

#### Import Libraries and Initialize MLflow

In [4]:
# Install required libraries using conda
# !conda install -c conda-forge pandas=2.0.3 numpy=1.24.3 matplotlib=3.7.1 seaborn=0.12.2 sqlalchemy=2.0.19 dash=2.11.1 plotly=5.15.0 jupyter=1.0.0 scikit-learn=1.3.0 mlflow=2.7.1 -y

Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

mlflow.set_experiment("NHANES Experiment")

2024/12/06 11:40:41 INFO mlflow.tracking.fluent: Experiment with name 'NHANES Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/Arthur/CODE/DataProjects/medical_data_analysis/mlruns/140641251031879809', creation_time=1733478041724, experiment_id='140641251031879809', last_update_time=1733478041724, lifecycle_stage='active', name='NHANES Experiment', tags={}>

#### Load and Preprocess Data

In [2]:
demo_data = pd.read_csv('data/demographics.csv')
body_measures = pd.read_csv('data/body_measures.csv')

# Merge datasets on a common column, e.g., 'SEQN'
data = pd.merge(demo_data, body_measures, on='SEQN')

# Example preprocessing: fill missing values, encode categorical variables, etc.
data.fillna(data.mean(), inplace=True)
data = pd.get_dummies(data, drop_first=True)

# Define features and target
X = data.drop('target_column', axis=1)  # Replace 'target_column' with actual target column
y = data['target_column']

KeyError: "['target_column'] not found in axis"

#### Train Model and Track with MLflow

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    acc = accuracy_score(y_test, predictions)

    mlflow.log_param("n_estimators", 100)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(model, "model")

    print(f"Accuracy: {acc}")

#### Visualize Results

In [None]:
cm = confusion_matrix(y_test, predictions)
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()