# PCOS Detection Model Training

This notebook trains a RandomForest classifier for PCOS detection, evaluates it on a held-out test split, and generates explainability artifacts using SHAP.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
# Silence every warning for the rest of the session to keep cell output compact
warnings.filterwarnings(action='ignore')

# Locations used throughout the notebook: the input CSV and the directory
# that receives every saved artifact (created here if it does not exist yet)
model_dir = '../../models'
data_path = '../data/sample_data.csv'
os.makedirs(name=model_dir, exist_ok=True)

In [None]:
# Read the raw CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

In [None]:
# Data preprocessing
# Turn the date column into a numeric "days elapsed" feature.
# The elapsed time is measured against the moment this cell runs, so the
# feature's values shift with every run. The reference timestamp is therefore
# captured exactly once and persisted next to the other artifacts, so the same
# reference can be reused to recompute the feature consistently later.
reference_date = pd.Timestamp.now()
df['last_period_date'] = pd.to_datetime(df['last_period_date'])
df['days_since_last_period'] = (reference_date - df['last_period_date']).dt.days

with open(f'{model_dir}/reference_date.pkl', 'wb') as f:
    pickle.dump(reference_date, f)

# Drop original date column (the model only sees the derived numeric feature)
df = df.drop(columns=['last_period_date'])

# Encode categorical variables as integer codes
le_hormone = LabelEncoder()
df['hormone_test_values'] = le_hormone.fit_transform(df['hormone_test_values'])

# Save the fitted encoders, keyed by column name, so the same mapping can be
# applied to new data
encoders = {
    'hormone_test_values': le_hormone
}
with open(f'{model_dir}/encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

# Define features and target: every column except the diagnosis is a feature
features = [col for col in df.columns if col != 'pcos_diagnosis']
X = df[features]
y = df['pcos_diagnosis']

# Save feature names (fixes the column order the model was trained with)
with open(f'{model_dir}/feature_names.pkl', 'wb') as f:
    pickle.dump(features, f)

print(f"Features: {features}")
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Persist the fitted model
model_path = f'{model_dir}/model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

# Score the model on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Feature importance
# Pair each feature name with the forest's importance score, ranked from
# most to least important
importance_table = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranking; written to disk before it is shown
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance')
plt.tight_layout()
importance_plot_path = f'{model_dir}/feature_importance.png'
plt.savefig(importance_plot_path)
plt.show()

print(feature_importance)

In [None]:
# SHAP explainability
# Build a tree-specific explainer for the trained forest
explainer = shap.TreeExplainer(model)

# Save the explainer so it can be reused without being rebuilt
with open(f'{model_dir}/explainer.pkl', 'wb') as f:
    pickle.dump(explainer, f)

# Calculate SHAP values for the test set
shap_values = explainer.shap_values(X_test)

# SHAP releases differ in what shap_values() returns for a classifier:
# older ones give a list with one (n_samples, n_features) array per class,
# newer ones a single (n_samples, n_features, n_classes) array. Indexing the
# newer layout with [1] would pick the second *sample*, not class 1, so the
# class-1 (PCOS) slice is extracted explicitly for either layout.
if isinstance(shap_values, list):
    shap_values_pcos = shap_values[1]
else:
    shap_values_pcos = np.asarray(shap_values)
    if shap_values_pcos.ndim == 3:
        shap_values_pcos = shap_values_pcos[:, :, 1]

# Baseline (expected model output) for class 1; falls back to the only
# available value if the explainer reports a single output
expected_values = np.atleast_1d(explainer.expected_value)
base_value_pcos = expected_values[1] if expected_values.size > 1 else expected_values[0]

# Save a sample of SHAP values for demo
sample_shap = {
    'values': shap_values_pcos[:5],  # First 5 samples for class 1 (PCOS)
    'base_values': base_value_pcos,
    'data': X_test.iloc[:5].values
}
with open(f'{model_dir}/sample_shap.pkl', 'wb') as f:
    pickle.dump(sample_shap, f)

# Generate SHAP summary plot (mean absolute contribution per feature)
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values_pcos, X_test, plot_type="bar", show=False)
plt.tight_layout()
plt.savefig(f'{model_dir}/shap_bar.png')
plt.close()

# Generate SHAP beeswarm plot (per-sample contributions)
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values_pcos, X_test, show=False)
plt.tight_layout()
plt.savefig(f'{model_dir}/shap_beeswarm.png')
plt.close()

# Generate SHAP dependence plot for the most important feature
# (top row of the importance ranking computed in the previous cell)
most_important_feature = feature_importance.iloc[0]['feature']
plt.figure(figsize=(10, 6))
shap.dependence_plot(most_important_feature, shap_values_pcos, X_test, show=False)
plt.tight_layout()
plt.savefig(f'{model_dir}/shap_dependence.png')
plt.close()

print("SHAP plots saved to /models directory")

In [None]:
# Generate a sample prediction with explanation
# Slice (rather than index) so the sample stays a one-row DataFrame
sample_idx = 0
sample_input = X_test.iloc[sample_idx:sample_idx+1]
sample_prediction = model.predict(sample_input)[0]
sample_prediction_proba = model.predict_proba(sample_input)[0]

# Get SHAP values for this sample
sample_shap_values = explainer.shap_values(sample_input)

# SHAP releases differ in the return layout for classifiers: a list with one
# (n_samples, n_features) array per class, or a single
# (n_samples, n_features, n_classes) array. Extract the class-1 (PCOS)
# contributions of the single sample for either layout; indexing the array
# layout with [1] would address a non-existent second sample.
if isinstance(sample_shap_values, list):
    pcos_contributions = sample_shap_values[1][0]
else:
    contribution_array = np.asarray(sample_shap_values)
    if contribution_array.ndim == 3:
        pcos_contributions = contribution_array[0, :, 1]
    else:
        pcos_contributions = contribution_array[0]

# Create a DataFrame for feature contributions, largest positive push first
feature_contributions = pd.DataFrame({
    'feature': features,
    'contribution': pcos_contributions  # For class 1 (PCOS)
}).sort_values('contribution', ascending=False)

print(f"Sample Input: {sample_input.values[0]}")
print(f"Prediction: {'PCOS' if sample_prediction == 1 else 'No PCOS'}")
print(f"Prediction Probability: {sample_prediction_proba[1]:.4f}")
print("\nTop 5 Feature Contributions:")
print(feature_contributions.head())

# Save sample prediction with explanation, converted to plain Python types
sample_prediction_data = {
    'input': sample_input.values[0].tolist(),
    'prediction': int(sample_prediction),
    'probability': float(sample_prediction_proba[1]),
    'feature_contributions': feature_contributions.head().to_dict('records')
}

with open(f'{model_dir}/sample_prediction.pkl', 'wb') as f:
    pickle.dump(sample_prediction_data, f)

## Model Training Complete

The following artifacts have been saved to the models directory (`model_dir`, i.e. `../../models` relative to this notebook):
- `model.pkl`: Trained RandomForest model
- `encoders.pkl`: Label encoders for categorical variables
- `feature_names.pkl`: List of feature names
- `explainer.pkl`: SHAP explainer object
- `sample_shap.pkl`: Sample SHAP values for demo
- `sample_prediction.pkl`: Sample prediction with explanation
- `feature_importance.png`: Feature importance plot
- `shap_bar.png`: SHAP bar plot
- `shap_beeswarm.png`: SHAP beeswarm plot
- `shap_dependence.png`: SHAP dependence plot for the most important feature