# 🩺 The Hidden Danger in ML: Pneumonia & Asthma Case Study
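This project explores how biased training data can lead to incorrect machine learning predictions, using a real-world-inspired case. In the simulated records, every asthma patient is labeled a survivor (standing in for the intensive care they receive), so a model trained on those records can wrongly conclude that asthma lowers the risk of death.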

## Step 1: Simulate Biased Dataset

In [1]:
import pandas as pd
import numpy as np

np.random.seed(42)
n = 1000

# Keyword arguments are evaluated left to right, so the seeded global RNG is
# consumed in the order age -> has_asthma -> severity_score.
data = pd.DataFrame(dict(
    age=np.random.randint(20, 90, size=n),
    has_asthma=np.random.choice([0, 1], size=n, p=[0.85, 0.15]),
    severity_score=np.random.normal(loc=50, scale=10, size=n),
))

# Baseline outcome: a patient dies when severity exceeds 60 or age exceeds 80.
high_severity = data["severity_score"].gt(60)
elderly = data["age"].gt(80)

# The bias being simulated: every asthma patient is recorded as a survivor
# (ICU overrides risk), regardless of the baseline outcome.
asthmatic = data["has_asthma"].eq(1)
data["death"] = ((high_severity | elderly) & ~asthmatic).astype(int)

data.head()

Unnamed: 0,age,has_asthma,severity_score,death
0,71,0,56.86566,0
1,34,0,27.867727,0
2,80,0,61.96895,1
3,40,0,50.830805,0
4,43,1,53.124917,0


## Step 2: Train Logistic Regression on Biased Data

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Predictors and target come straight from the simulated (biased) records.
feature_names = ["age", "has_asthma", "severity_score"]
X = data[feature_names]
y = data["death"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Classification report on biased data:")
print(classification_report(y_test, y_pred))

Classification report on biased data:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       156
           1       0.61      0.61      0.61        44

    accuracy                           0.83       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.83      0.83      0.83       200



### Model Coefficients

In [3]:
import pandas as pd
# One row per input feature, paired with the weight the biased model fitted for it.
# coef_ has a single row for this binary target, hence the [0].
biased_weights = model.coef_[0]
coeffs = pd.DataFrame({"Feature": X.columns, "Coefficient": biased_weights})
coeffs

Unnamed: 0,Feature,Coefficient
0,age,0.071178
1,has_asthma,-3.665603
2,severity_score,0.181166


## Step 3: Simulate Correct Death Labels and Retrain

In [8]:
# Work on a copy so the biased frame stays available for comparison.
corrected = data.copy()

# Relabel outcomes: death is 1 when severity exceeds 60, age exceeds 80,
# or the patient has asthma (asthma now counts as a risk factor).
severe = corrected["severity_score"] > 60
elderly = corrected["age"] > 80
asthmatic = corrected["has_asthma"] == 1
corrected["death"] = (severe | elderly | asthmatic).astype(int)

print(corrected)

corr_feature_names = ["age", "has_asthma", "severity_score"]
X_corr = corrected[corr_feature_names]
y_corr = corrected["death"]

# Same 80/20 split settings as the biased run, so the two reports are comparable.
X_train_corr, X_test_corr, y_train_corr, y_test_corr = train_test_split(
    X_corr,
    y_corr,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model_corr = LogisticRegression().fit(X_train_corr, y_train_corr)

y_pred_corr = model_corr.predict(X_test_corr)
print("Classification report on corrected data:")
print(classification_report(y_test_corr, y_pred_corr))

     age  has_asthma  severity_score  death
0     71           0       56.865660      0
1     34           0       27.867727      0
2     80           0       61.968950      1
3     40           0       50.830805      0
4     43           1       53.124917      1
..   ...         ...             ...    ...
995   73           0       40.982184      0
996   33           0       57.372913      0
997   74           0       48.413833      0
998   67           0       50.424825      0
999   26           0       39.961089      0

[1000 rows x 4 columns]
Classification report on corrected data:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       123
           1       0.78      0.78      0.78        77

    accuracy                           0.83       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.83      0.83      0.83       200



### Model Coefficients After Correction

In [5]:
# One row per input feature, paired with the weight fitted on the relabeled data.
# coef_ has a single row for this binary target, hence the [0].
corrected_weights = model_corr.coef_[0]
coeffs_corr = pd.DataFrame({"Feature": X_corr.columns, "Coefficient": corrected_weights})
coeffs_corr

Unnamed: 0,Feature,Coefficient
0,age,0.063789
1,has_asthma,5.836247
2,severity_score,0.163218


## 🧠 Reflection Questions
1. What did the model originally learn about asthma?
2. How did the model's behavior change after we fixed the labels?
3. Why is understanding the context of data collection important?
4. What could happen if hospitals used the first (biased) model in practice?