In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer

In [3]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    # YOUR CODE HERE
    # Load the CSV file using pandas
    df = pd.read_csv(file_path)
    
    return df

In [4]:
def prepare_data_part1(df, test_size=0.2, random_state=42):
    """
    Prepare data for modeling: select features, split into train/test sets, handle missing values.
    
    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    # YOUR CODE HERE
    # 1. Select relevant features (age, systolic_bp, diastolic_bp, glucose_level, bmi)
    # 2. Select target variable (disease_outcome)
    # 3. Split data into training and testing sets
    # 4. Handle missing values using SimpleImputer
    
    X = df[['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi']]
    y = df['disease_outcome']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state = 21
    )
    
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.fit_transform(X_test)
    
    # Placeholder return - replace with your implementation
    return X_train, X_test, y_train, y_test

In [5]:
def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Trained logistic regression model
    """
    # YOUR CODE HERE
    # Initialize and train a LogisticRegression model
    model = LogisticRegression().fit(X_train, y_train)

    return model 

In [6]:
def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target
        
    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """
    # YOUR CODE HERE
    # 1. Generate predictions
    # 2. Calculate metrics: accuracy, precision, recall, f1, auc
    # 3. Create confusion matrix
    # 4. Return metrics in a dictionary
    
    # Placeholder return - replace with your implementation

    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred) 
    cm = confusion_matrix(y_test, y_pred)
    
#     print("Accuracy:", accuracy_score(y_test, y_pred))
#     print("Precision:", precision_score(y_test, y_pred))
#     print("Recall:", recall_score(y_test, y_pred))
#     print("F1 Score:", f1)
#     print("AUC:", auc)
#     print("Confusion Matrix", cm)
    
    results = {"accuracy": acc, "precision": prec, "recall": recall, "f1": f1, "auc": auc, "confusion_matrix": cm}
    
    return results

In [7]:
def interpret_results(metrics):
    """
    Analyze model performance on imbalanced data.
    
    Args:
        metrics: Dictionary containing evaluation metrics
        
    Returns:
        Dictionary with keys:
        - 'best_metric': Name of the metric that performed best
        - 'worst_metric': Name of the metric that performed worst
        - 'imbalance_impact_score': A score from 0-1 indicating how much
          the class imbalance affected results (0=no impact, 1=severe impact)
    """
    # YOUR CODE HERE
    # 1. Determine which metric performed best and worst
    # 2. Calculate an imbalance impact score based on the difference
    #    between accuracy and more imbalance-sensitive metrics like F1 or recall
    # 3. Return the results as a dictionary
    best = -1
    best_metric = None
    
    worst = 99999
    worst_metric = None
    
    for key, value in metrics.items():
        if value.dtype == np.float64: # don't check confusion matrix
            if value > best: # check for best
                best_metric = key
                best = value

            if value < worst: # check for worst
                worst_metric = key
                worst = value
            
    imbalance_score = best - worst # calculate imbalance
    
    # Placeholder return - replace with your implementation
    return {
        'best_metric': best_metric,
        'worst_metric': worst_metric,
        'imbalance_impact_score': imbalance_score
    }

In [8]:
# Main execution
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part1(df)
    
    # 3. Train model
    model = train_logistic_regression(X_train, y_train)
    
    # 4. Evaluate model
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 5. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 6. Save results
    # (Your code for saving results)
    # Create results directory and save metrics
    # YOUR CODE HERE
    
    # 1. Create 'results' directory if it doesn't exist
    # 2. Format metrics as strings
    # 3. Write metrics to 'results/results_part1.txt'

#     os.makedirs('results', exist_ok=True) # create dir if does not exist

#     output_file = 'results/results_part1.txt'

#     with open(output_file, "w") as file: # open file for writing

#         file.write(f"Accuracy: {metrics['accuracy']}\n") # write results to output file
#         file.write(f"Precision: {metrics['precision']}\n")
#         file.write(f"Recall: {metrics['recall']}\n")
#         file.write(f"F1 Score: {metrics['f1']}\n")
#         file.write(f"Confusion Matrix: {metrics['confusion_matrix']}\n")

#     file.close()
    metrics_json = {
    "accuracy": float(metrics['accuracy']),
    "precision": float(metrics['precision']),
    "recall": float(metrics['recall']),
    "f1": float(metrics['f1']),
    "confusion_matrix": metrics['confusion_matrix'].tolist()  # convert from NumPy array
    }

    os.makedirs('results', exist_ok=True)

    # Save to JSON file
    output_file = 'results/results_part1.txt'
    with open(output_file, 'w') as f:
        json.dump(metrics_json, f, indent=4)

    print(f"\nMetrics saved to {output_file}")


    print(f"\nSynthetic data saved to {output_file}")

    # 7. Interpret results
    interpretation = interpret_results(metrics)
    print("\nResults Interpretation:")
    for key, value in interpretation.items():
        print(f"{key}: {value}")

accuracy: 0.9245
precision: 0.6741
recall: 0.4272
f1: 0.5230
auc: 0.7025


NameError: name 'json' is not defined