In [1]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
#1: Setup

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [10]:
#2 Data Loading

def load_data(file_path):
    """
    Load the synthetic health data from a CSV file.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        DataFrame containing the data
    """
    # YOUR CODE HERE
    # Load the CSV file using pandas
    
    return pd.read_csv(file_path)

df = load_data('/workspaces/5-put-a-label-on-it-rishim3000/data/synthetic_health_data.csv')

In [11]:
#3 - Categorical Feature Encoding

def encode_categorical_features(df, column_to_encode='smoker_status'):
    """
    Encode a categorical column using OneHotEncoder.
    
    Args:
        df: Input DataFrame
        column_to_encode: Name of the categorical column to encode
        
    Returns:
        DataFrame with the categorical column replaced by one-hot encoded columns
    """
    encoder = OneHotEncoder(sparse_output = False, drop = 'first')
    encoded = encoder.fit_transform(df[[column_to_encode]])
    encoded_columns = encoder.get_feature_names_out([column_to_encode])
    df_encoded = pd.DataFrame(encoded, columns=encoded_columns, index = df.index)
    df = pd.concat([df.drop(columns = [column_to_encode]), df_encoded], axis = 1)
    # Placeholder return - replace with your implementation
    return df

encode_categorical_features(df, column_to_encode='smoker_status')

Unnamed: 0,patient_id,timestamp,age,systolic_bp,diastolic_bp,glucose_level,bmi,heart_rate,disease_outcome,smoker_status_no,smoker_status_yes
0,1,2023-01-29 00:00:00.000000,57,113.063416,84.069561,117.475210,25.085796,62.719587,0,1.0,0.0
1,1,2023-01-31 07:33:55.507789,57,121.598849,89.672279,85.120875,24.120608,76.314434,0,1.0,0.0
2,1,2023-02-02 00:15:11.379377,57,126.623222,87.619685,,24.819332,62.427785,0,1.0,0.0
3,1,2023-02-04 09:37:12.589164,57,136.999366,89.199774,118.755648,25.039598,61.612981,0,1.0,0.0
4,1,2023-02-04 20:56:52.838198,57,127.546919,92.644673,98.882007,24.895024,77.649615,0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
7321,150,2023-03-18 09:08:49.029823,54,115.038254,79.241741,84.586944,29.968156,73.599447,0,1.0,0.0
7322,150,2023-03-20 14:38:22.129593,54,116.389186,70.464818,91.476621,29.519510,64.162701,0,1.0,0.0
7323,150,2023-03-23 09:26:04.210673,54,123.419606,88.213054,96.985434,29.786678,71.641423,0,1.0,0.0
7324,150,2023-03-27 14:17:19.255961,54,,69.539940,85.670800,29.188655,72.781243,0,1.0,0.0


In [12]:
#4 - Data Prep
def prepare_data_part3(df, test_size=0.2, random_state=42):
    """
    Prepare data with categorical encoding.
    
    Args:
        df: Input DataFrame
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    df = encode_categorical_features(df)
    #selected all categorical features including one hote encoded
    features = features = [col for col in df.columns if col not in ['patient_id', 'timestamp', 'disease_outcome']]

    X = df[features]
    y = df['disease_outcome']
    imputer = SimpleImputer(strategy  = 'mean')
    X_imputed = imputer.fit_transform(X)    
    # Placeholder return - replace with your implementation
    return train_test_split(X_imputed, y, test_size = test_size, random_state = random_state)

X_train, X_test, y_train, y_test = prepare_data_part3(df, test_size=0.2, random_state=42)

In [13]:
#Imbalanced Data

def apply_smote(X_train, y_train, random_state=42):
    """
    Apply SMOTE to oversample the minority class.
    
    Args:
        X_train: Training features
        y_train: Training target
        random_state: Random seed for reproducibility
        
    Returns:
        Resampled X_train and y_train with balanced classes
    """
    smote = SMOTE(random_state = random_state)
    return smote.fit_resample(X_train, y_train)

apply_smote(X_train, y_train, random_state=42)


(array([[ 63.        , 118.90240003,  67.32984448, ...,  88.93652402,
           1.        ,   0.        ],
        [ 21.        , 124.9311563 ,  95.4349238 , ...,  61.82602609,
           0.        ,   0.        ],
        [ 45.        , 116.31968843,  84.72964664, ...,  74.9138022 ,
           1.        ,   0.        ],
        ...,
        [ 29.87106455, 135.25733364,  90.47406096, ...,  76.38164677,
           0.62368818,   0.37631182],
        [ 21.        , 133.16944515,  87.79471834, ...,  57.61226578,
           0.        ,   1.        ],
        [ 35.        , 139.97748374, 100.77837016, ...,  93.46452139,
           0.        ,   0.        ]], shape=(10582, 8)),
 0        0
 1        0
 2        0
 3        0
 4        0
         ..
 10577    1
 10578    1
 10579    1
 10580    1
 10581    1
 Name: disease_outcome, Length: 10582, dtype: int64)

In [17]:
#6 - Model Training and Evaluation

def train_logistic_regression(X_train, y_train):
    """
    Train a logistic regression model.
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Trained logistic regression model
    """
    model = LogisticRegression(max_iter = 10000)
    model.fit(X_train, y_train)
    
    return model

model = train_logistic_regression(X_train, y_train)

def calculate_evaluation_metrics(model, X_test, y_test):
    """
    Calculate classification evaluation metrics.
    
    Args:
        model: Trained model
        X_test: Test features
        y_test: Test target
        
    Returns:
        Dictionary containing accuracy, precision, recall, f1, auc, and confusion_matrix
    """
    y_preds = model.predict(X_test)
    y_probs = model.predict_proba(X_test)[:, 1]

    return {
        'accuracy': accuracy_score(y_test, y_preds),
        'precision': precision_score(y_test, y_preds),
        'recall': recall_score(y_test, y_preds),
        'f1': f1_score(y_test, y_preds),
        'auc': roc_auc_score(y_test, y_probs),
        'confusion_matrix': confusion_matrix(y_test, y_preds).tolist()
    }

metrics = calculate_evaluation_metrics(model, X_test, y_test)

In [18]:
#7 - Save results
import json

with open('results/results_part3.txt', 'w') as f:
    for key, value in metrics.items():
        if isinstance(value, list):  # confusion matrix
            f.write(f"{key}: {json.dumps(value)}\n")
        else:
            f.write(f"{key}: {value:.4f}\n")

In [22]:
#8 - Main Execution

if __name__ == "__main__":
    # 1. Load data
    data_file = 'data/synthetic_health_data.csv'
    df = load_data(data_file)
    
    # 2. Prepare data with categorical encoding
    X_train, X_test, y_train, y_test = prepare_data_part3(df)
    
    # 3. Apply SMOTE to balance the training data
    X_train_resampled, y_train_resampled = apply_smote(X_train, y_train)
    
    # 4. Train model on resampled data
    model = train_logistic_regression(X_train_resampled, y_train_resampled)
    
    # 5. Evaluate on original test set
    metrics = calculate_evaluation_metrics(model, X_test, y_test)
    
    # 6. Print metrics
    for metric, value in metrics.items():
        if metric != 'confusion_matrix':
            print(f"{metric}: {value:.4f}")
    
    # 7. Save results
    import json

    with open('results/results_part3.txt', 'w') as f:
        for key, value in metrics.items():
            if isinstance(value, list):  # confusion matrix
                f.write(f"{key}: {json.dumps(value)}\n")
            else:
                f.write(f"{key}: {value:.4f}\n")


accuracy: 0.8568
precision: 0.3930
recall: 0.8601
f1: 0.5395
auc: 0.9263


In [23]:
%run part1_introduction.ipynb
print(metrics_dict)
'''
def compare_models(part1_metrics, part3_metrics):
    """
    Calculate percentage improvement between models trained on imbalanced vs. balanced data.
    
    Args:
        part1_metrics: Dictionary containing evaluation metrics from Part 1 (imbalanced)
        part3_metrics: Dictionary containing evaluation metrics from Part 3 (balanced)
        
    Returns:
        Dictionary with metric names as keys and improvement percentages as values
    """
    pct_changes = {}
    for key in :
        old_val = old_metrics[key]
        new_val = new_metrics[key]

        if isinstance(old_val, (int, float)) and isinstance(new_val, (int, float)):
            if old_val != 0:
                change = ((new_val - old_val) / abs(old_val)) * 100
                pct_changes[key] = round(change, 2)
            else:
                pct_changes[key] = float('inf') if new_val != 0 else 0.0
        else:
            pct_changes[key] = None  # Skip or handle non-numeric types separately

    return pct_changes
'''

Metrics have been written to 'results/results.txt'.
Accuracy: 0.9167803547066848
Precision: 0.6615384615384615
Recall: 0.3006993006993007
F1: 0.41346153846153844
AUC: 0.6420352134637849
Confusion_Matrix: [[1301   22]
 [ 100   43]]
{'Accuracy': 0.9167803547066848, 'Precision': 0.6615384615384615, 'Recall': 0.3006993006993007, 'F1': 0.41346153846153844, 'AUC': 0.6420352134637849, 'Confusion_Matrix': array([[1301,   22],
       [ 100,   43]])}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


'\ndef compare_models(part1_metrics, part3_metrics):\n    """\n    Calculate percentage improvement between models trained on imbalanced vs. balanced data.\n\n    Args:\n        part1_metrics: Dictionary containing evaluation metrics from Part 1 (imbalanced)\n        part3_metrics: Dictionary containing evaluation metrics from Part 3 (balanced)\n\n    Returns:\n        Dictionary with metric names as keys and improvement percentages as values\n    """\n    pct_changes = {}\n    for key in :\n        old_val = old_metrics[key]\n        new_val = new_metrics[key]\n\n        if isinstance(old_val, (int, float)) and isinstance(new_val, (int, float)):\n            if old_val != 0:\n                change = ((new_val - old_val) / abs(old_val)) * 100\n                pct_changes[key] = round(change, 2)\n            else:\n                pct_changes[key] = float(\'inf\') if new_val != 0 else 0.0\n        else:\n            pct_changes[key] = None  # Skip or handle non-numeric types separately