In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer

Part 1 - Loading the data

In [2]:
def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    #reading data
    df = pd.read_csv(file_path)
    
    return df 

df = load_data("/workspaces/5-put-a-label-on-it-rishim3000/data/synthetic_health_data.csv")


Part 2 - Data Preparation

In [3]:
def prepare_data_part1(df, test_size=0.2, random_state=42):
    """
    Prepare data for modeling: select features, split into train/test sets, handle missing values.
    
    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    #Feature and target variable selection
    features = ['age', 'systolic_bp', 'diastolic_bp', 'glucose_level', 'bmi']
    target = ['disease_outcome']

    X = df[features]
    y = df[target]

    #replacing missing values of systolic bp and glucose with mean
    imputer = SimpleImputer(strategy = 'mean')

    X.loc[:, ['systolic_bp', 'glucose_level']] = imputer.fit_transform(X[['systolic_bp', 'glucose_level']])

    #Training-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state)
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = prepare_data_part1(df)


Part 4 - Model Training

In [4]:
def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Trained logistic regression model
    """
    #initialize and train logistic regression model
    model = LogisticRegression(max_iter = 10000, random_state=0)
    model.fit(X_train, y_train)
    
    return model

model = train_logistic_regression(X_train, y_train)

  y = column_or_1d(y, warn=True)


Part 5 - Evaulating Model


In [5]:
def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target
        
    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """

    y_pred = model.predict(X_test)

    metrics_dict = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'AUC': float(roc_auc_score(y_test, y_pred)),
        'Confusion_Matrix': confusion_matrix(y_test, y_pred)
    }

    return metrics_dict

calculate_evaluation_metrics(model, X_test, y_test)

{'Accuracy': 0.9167803547066848,
 'Precision': 0.6615384615384615,
 'Recall': 0.3006993006993007,
 'F1': 0.41346153846153844,
 'AUC': 0.6420352134637849,
 'Confusion_Matrix': array([[1301,   22],
        [ 100,   43]])}

Part 6 - Saving Results

In [8]:
# Create results directory and save metrics
# YOUR CODE HERE
# 1. Create 'results' directory if it doesn't exist
# 2. Format metrics as strings
# 3. Write metrics to 'results/results_part1.txt'
import json
metrics_dict = calculate_evaluation_metrics(model, X_test, y_test)

os.makedirs('results', exist_ok=True)
directory = 'results'
filename = 'results.txt'
filepath = os.path.join(directory, filename)


with open(filepath, 'w') as f:
    for key, value in metrics_dict.items():
        if isinstance(value, (list, np.ndarray)):
            f.write(f"{key}: {json.dumps(value.tolist() if isinstance(value, np.ndarray) else value)}\n")
        elif isinstance(value, (float, int)):
            f.write(f"{key}: {value:.4f}\n")
        else:
            f.write(f"{key}: {str(value)}\n")

print(f"Metrics have been written to '{filepath}'.")


Metrics have been written to 'results/results.txt'.


Part 7 - Main Execution

In [28]:
if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data
    X_train, X_test, y_train, y_test = prepare_data_part1(df)
    
    # 3. Train model
    model = train_logistic_regression(X_train, y_train)
    
    # 4. Evaluate model
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 5. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value}")
    
    #6. Saving Results
    metrics_list = []

    for key in metrics:
        metrics[key] = str(metrics[key]).replace('\n', '')
        metrics_list.append(metrics[key])

    metrics_str = str(metrics_list)

    os.makedirs('results', exist_ok=True)
    directory = 'results'
    filename = 'results.txt'
    filepath = os.path.join(directory, filename)

    with open(filepath, "w") as file:
        file.write(metrics_str)
        file.close()

Accuracy: 0.9167803547066848
Precision: 0.6615384615384615
Recall: 0.3006993006993007
F1: 0.41346153846153844
AUC: 0.6420352134637849
Confusion Matrix: [[1301   22]
 [ 100   43]]


  y = column_or_1d(y, warn=True)


Part 8 - Interpret Results