# Accidental Drug-Related Deaths – Predictive Modeling & Risk Profiling

**Objective:** Analyze toxicology and demographic data from accidental drug-related death records to predict heroin involvement (the `Fatal` target used below) and uncover associated risk factors.

## 1. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

## 2. Load and Explore Data

In [None]:
# Location of the raw records, relative to the notebook's working directory.
csv_path = "Accidental_Drug_Related_Deaths.csv"
df = pd.read_csv(csv_path)

# Print the (rows, columns) size; the trailing expression previews the first rows.
print("Shape:", df.shape)
df.head()

## 3. Data Cleaning

In [None]:
# Columns considered irrelevant or mostly empty; any that are absent are skipped.
unused_columns = ["Death City", "Autopsy", "Residence City"]
# Rows missing any of these demographic fields are discarded.
required_columns = ["Age", "Sex", "Race"]

df = (
    df.drop(columns=unused_columns, errors='ignore')
      .dropna(subset=required_columns)
      .fillna(0)  # every remaining missing value becomes 0
)
print("Cleaned Shape:", df.shape)

## 4. Feature Engineering

In [None]:
def _as_binary_flag(column):
    """Return `column` as a 0/1 integer Series.

    A value counts as positive when it is numerically equal to 1 or is one of
    the text markers 'Y', 'YES' or 'TRUE' (case- and whitespace-insensitive).
    Everything else, including the 0 used to fill missing values during
    cleaning, maps to 0.
    """
    as_number = pd.to_numeric(column, errors='coerce')
    as_text = column.astype(str).str.strip().str.upper()
    return ((as_number == 1) | as_text.isin(['Y', 'YES', 'TRUE'])).astype(int)


drug_features = ['Fentanyl', 'Oxycodone', 'Benzodiazepine', 'Cocaine']
categorical_features = ['Sex', 'Race']

# Target: 1 when the Heroin column is positive, otherwise 0. The column keeps
# its original name 'Fatal', but what it encodes is heroin involvement.
# NOTE(review): positives may be stored as 'Y' rather than 1 in the CSV --
# confirm; both encodings are accepted here.
df['Fatal'] = _as_binary_flag(df['Heroin'])

# One-hot encode the categorical columns. get_dummies replaces 'Sex' and 'Race'
# with 'Sex_*' / 'Race_*' columns, so skip any that are already encoded; this
# keeps the cell safe to re-run.
to_encode = [col for col in categorical_features if col in df.columns]
if to_encode:
    df = pd.get_dummies(df, columns=to_encode, drop_first=True)

# Only columns that still exist after encoding are listed here; the raw
# 'Sex' / 'Race' names are gone and are represented by their dummy columns.
features = ['Age'] + drug_features
dummy_columns = [col for col in df.columns if col.startswith('Sex_') or col.startswith('Race_')]

# Build the design matrix on a copy so that df itself keeps the raw drug values.
X = df[features + dummy_columns].copy()
X[drug_features] = X[drug_features].apply(_as_binary_flag)
# A uniform float dtype avoids mixed bool/int/object columns reaching the scaler.
X = X.astype(float)
y = df['Fatal']

## 5. Train-Test Split and Scaling

In [None]:
# Hold out 20% of the rows for testing, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6. Modeling and Evaluation

In [None]:
# Candidate classifiers, keyed by the label used in the printed reports.
models = {}
models["Logistic Regression"] = LogisticRegression()
models["Naive Bayes"] = GaussianNB()
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["KNN"] = KNeighborsClassifier(n_neighbors=5)

# Fit each classifier on the scaled training data and report its test-set metrics.
for name in models:
    model = models[name].fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, positive_proba)

    print(f"\n{name} Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("ROC AUC:", round(auc, 2))

## 7. ROC Curve

In [None]:
# Draw one ROC curve per fitted model on a shared set of axes.
fig, ax = plt.subplots(figsize=(8, 6))
for name, model in models.items():
    y_score = model.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    auc_value = roc_auc_score(y_test, y_score)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {auc_value:.2f})')

# Dashed diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title("ROC Curve Comparison")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

## ✅ Conclusion


- **Random Forest** had the best performance with highest accuracy and AUC.
- **Fentanyl** and **Benzodiazepines** were key predictive features; **Heroin** defines the target label, so it is what the models predict rather than an input feature.
- ROC curves compared each model's ability to separate the two classes on the held-out test set.
- Future enhancements: geographic and temporal features, deployment via dashboard/API.
    