<a href="https://colab.research.google.com/github/AchiengMary/female_health_models/blob/main/Personalized_Treatment_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd

# Load datasets
cyst_df = pd.read_csv("Ovarian_Cyst.csv")
resource_df = pd.read_csv("Resources_Inventory.csv")
treatment_df = pd.read_csv("Treatment_Costs.csv")

In [4]:
print("🧬 Cyst Data Overview")
print(cyst_df.info())
print("\n🔢 Summary Statistics:")
print(cyst_df.describe(include='all'))
print("\n📌 Missing Values:")
print(cyst_df.isnull().sum())
print("\n🎯 Unique values in key columns:")
print("Regions:", cyst_df['Region'].unique())
print("Menopause Status:", cyst_df['Menopause Status'].unique())
print("Recommended Management:", cyst_df['Recommended Management'].unique())

🧬 Cyst Data Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Patient ID                 100 non-null    object 
 1   Age                        100 non-null    int64  
 2   Menopause Status           100 non-null    object 
 3   Cyst Size cm               100 non-null    float64
 4   Cyst Growth Rate cm/month  100 non-null    float64
 5   CA 125 Level               100 non-null    int64  
 6   Ultrasound Features        100 non-null    object 
 7   Reported Symptoms          100 non-null    object 
 8   Recommended Management     100 non-null    object 
 9   Date of Exam               100 non-null    object 
 10  Region                     100 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 8.7+ KB
None

🔢 Summary Statistics:
       Patient ID         Age Menopause Status  Cyst Size

In [5]:
print("🏥 Resource Inventory Overview")
print(resource_df.info())
print("\n🔢 Summary Statistics:")
print(resource_df.describe(include='all'))
print("\n📌 Missing Values:")
print(resource_df.isnull().sum())
print("\n📦 Unique Facilities & Categories:")
print("Facilities:", resource_df['Facility'].nunique())
print("Regions:", resource_df['Region'].unique())
print("Categories:", resource_df['Category'].unique())
print("Items:", resource_df['Item'].nunique())

🏥 Resource Inventory Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Facility         100 non-null    object 
 1   Region           100 non-null    object 
 2   Category         100 non-null    object 
 3   Item             100 non-null    object 
 4   Cost (KES)       100 non-null    float64
 5   Available Stock  100 non-null    int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB
None

🔢 Summary Statistics:
                       Facility Region     Category  \
count                       100    100          100   
unique                       10     19            3   
top     Mombasa County Hospital  Embu   Medications   
freq                         14     14           46   
mean                        NaN    NaN          NaN   
std                         NaN    NaN          NaN   
min                     

In [6]:
print("💳 Treatment Cost Overview")
print(treatment_df.info())
print("\n🔢 Summary Statistics:")
print(treatment_df.describe(include='all'))
print("\n📌 Missing Values:")
print(treatment_df.isnull().sum())
print("\n📋 Sample Cost Rows:")
print(treatment_df[['Service', 'Base Cost (KES)', 'NHIF Covered', 'Insurance Copay (KES)', 'Out-of-Pocket (KES)']].head())
print("\n🏥 Unique Facilities and Services:")
print("Facilities:", treatment_df['Facility'].nunique())
print("Services:", treatment_df['Service'].nunique())

💳 Treatment Cost Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Facility               100 non-null    object 
 1   Region                 100 non-null    object 
 2   Category               100 non-null    object 
 3   Service                100 non-null    object 
 4   Base Cost (KES)        100 non-null    float64
 5   NHIF Covered           100 non-null    object 
 6   Insurance Copay (KES)  100 non-null    float64
 7   Out-of-Pocket (KES)    100 non-null    float64
dtypes: float64(3), object(5)
memory usage: 6.4+ KB
None

🔢 Summary Statistics:
                       Facility Region  Category             Service  \
count                       100    100       100                 100   
unique                       10     19         5                  10   
top     Mombasa County Hospital  Embu   Lab Test  Ovari

In [8]:
# Clean whitespace in key string columns
for df in [cyst_df, resource_df, treatment_df]:
    df['Region'] = df['Region'].str.strip()
    if 'Facility' in df.columns:
        df['Facility'] = df['Facility'].str.strip()

In [9]:
# Step 1: Merge cyst_df with resource_df on Region and Category
cyst_resource_df = cyst_df.merge(
    resource_df,
    how='left',
    on='Region',
    suffixes=('', '_resource')
)

# Step 2: Merge with treatment_df on Region and Category
full_df = cyst_resource_df.merge(
    treatment_df,
    how='left',
    on=['Region', 'Category'],
    suffixes=('', '_treatment')
)

In [10]:
#Verify merged dataset
print(full_df.shape)
print(full_df[['Patient ID', 'Region', 'Facility', 'Category', 'Item', 'Service', 'Available Stock', 'Base Cost (KES)', 'Out-of-Pocket (KES)']].head())

(488, 22)
  Patient ID     Region                       Facility         Category  \
0    OC-1000    Eldoret                            NaN              NaN   
1    OC-1001  Loitoktok  Loitoktok Sub-County Hospital      Medications   
2    OC-1001  Loitoktok  Loitoktok Sub-County Hospital      Consumables   
3    OC-1001  Loitoktok  Loitoktok Sub-County Hospital      Consumables   
4    OC-1001  Loitoktok  Loitoktok Sub-County Hospital  Lab Consumables   

                  Item Service  Available Stock  Base Cost (KES)  \
0                  NaN     NaN              NaN              NaN   
1    Doxycycline 100mg     NaN             89.0              NaN   
2  Latex Gloves (pair)     NaN             68.0              NaN   
3             Speculum     NaN             51.0              NaN   
4      CA-125 Test Kit     NaN             70.0              NaN   

   Out-of-Pocket (KES)  
0                  NaN  
1                  NaN  
2                  NaN  
3                  NaN  
4    

In [11]:
#Handle categorical variables

# Copy dataframe
data = full_df.copy()

# One-hot encode categorical variables
categorical_cols = [
    'Menopause Status',
    'Ultrasound Features',
    'Region',
    'Category',
    'NHIF Covered'
]

data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

In [12]:
# Split multi-symptom strings into individual binary flags
symptom_series = data['Reported Symptoms'].str.get_dummies(sep=',')

# Clean column names
symptom_series.columns = symptom_series.columns.str.strip()

# Join to main dataframe
data = pd.concat([data, symptom_series], axis=1)

# Drop original symptom text
data.drop('Reported Symptoms', axis=1, inplace=True)

In [13]:
# Drop unnecessary columns
drop_cols = ['Patient ID', 'Date of Exam', 'Facility', 'Item', 'Service']
data.drop(columns=[col for col in drop_cols if col in data.columns], inplace=True)

# Drop rows with missing target
data = data.dropna(subset=['Recommended Management'])

# Optional: Drop rows with missing inputs
data = data.dropna()

In [14]:
# Label encode target (e.g., Observation → 0, Medication → 1, etc.)
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['Recommended Management'] = le.fit_transform(data['Recommended Management'])

# Save for inverse_transform later
target_labels = list(le.classes_)

In [15]:
# Separate features and target
X = data.drop('Recommended Management', axis=1)
y = data['Recommended Management']

In [16]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [17]:
print(X.shape)
print(y.shape)
print(X.head())
print(y.value_counts())

(0, 31)
(0,)
Empty DataFrame
Columns: [Age, Cyst Size cm, Cyst Growth Rate cm/month, CA 125 Level, Cost (KES), Available Stock, Facility_treatment, Base Cost (KES), Insurance Copay (KES), Out-of-Pocket (KES), Menopause Status_Pre-menopausal, Ultrasound Features_Hemorrhagic cyst, Ultrasound Features_Septated cyst, Ultrasound Features_Simple cyst, Ultrasound Features_Solid mass, Region_Kisumu, Region_Loitoktok, Region_Mombasa, Region_Nairobi, Category_Lab Consumables, Category_Medications, Bloating, Fatigue, Irregular periods, Nausea, Pelvic pain, Bloating, Fatigue, Irregular periods, Nausea, Pelvic pain]
Index: []

[0 rows x 31 columns]
Series([], Name: count, dtype: int64)


In [18]:
y = final_df['Recommended Management']
X = final_df.drop(['Recommended Management', 'Patient ID', 'Date of Exam'], axis=1)

NameError: name 'final_df' is not defined

In [19]:
# Merge cyst data with treatment cost on Region and Category (or any other common columns)
merged_df = cyst_df.merge(treatment_df, on='Region', how='left')
merged_df = merged_df.merge(resource_df, on=['Region', 'Category'], how='left')

# Save to final_df
final_df = merged_df


In [20]:
print(final_df.shape)
print(final_df.columns)
print(final_df['Recommended Management'].value_counts())

(488, 22)
Index(['Patient ID', 'Age', 'Menopause Status', 'Cyst Size cm',
       'Cyst Growth Rate cm/month', 'CA 125 Level', 'Ultrasound Features',
       'Reported Symptoms', 'Recommended Management', 'Date of Exam', 'Region',
       'Facility_x', 'Category', 'Service', 'Base Cost (KES)', 'NHIF Covered',
       'Insurance Copay (KES)', 'Out-of-Pocket (KES)', 'Facility_y', 'Item',
       'Cost (KES)', 'Available Stock'],
      dtype='object')
Recommended Management
Observation    143
Surgery        134
Medication     130
Referral        81
Name: count, dtype: int64


In [21]:
y = final_df['Recommended Management']

# Drop columns that shouldn't go into modeling (IDs, dates, target)
X = final_df.drop(['Recommended Management', 'Patient ID', 'Date of Exam'], axis=1, errors='ignore')

# Optional: Encode categorical variables
X = pd.get_dummies(X, drop_first=True)

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Train classification models

try out 3 models,

Logistic Regression (baseline)
Random Forest Classifier
XGBoost

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier  # if xgboost is installed

In [24]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

for name, model in models.items():
    print(f"🧠 {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("-" * 60)


🧠 Logistic Regression


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

X_train contains NaN (missing) values, which LogisticRegression and most models do not support directly. to retain data size and avoid loss of patterns.



In [25]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')  # or 'mean' for numerical columns
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)



ValueError: Shape of passed values is (390, 75), indices imply (390, 77)

The imputer dropped 2 columns ('Cost (KES)' and 'Available Stock') entirely because they contain only missing values, and therefore can't be imputed.

In [26]:
X_train = X_train.dropna(axis=1, how='all')
X_test = X_test.dropna(axis=1, how='all')

In [27]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [28]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

for name, model in models.items():
    print(f"🧠 {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("-" * 60)

🧠 Logistic Regression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.35714285714285715
Confusion Matrix:
 [[ 9  5  4  8]
 [ 7 13  1  8]
 [ 7  6  2  1]
 [ 9  6  1 11]]
Classification Report:
               precision    recall  f1-score   support

  Medication       0.28      0.35      0.31        26
 Observation       0.43      0.45      0.44        29
    Referral       0.25      0.12      0.17        16
     Surgery       0.39      0.41      0.40        27

    accuracy                           0.36        98
   macro avg       0.34      0.33      0.33        98
weighted avg       0.35      0.36      0.35        98

------------------------------------------------------------
🧠 Random Forest
Accuracy: 0.8673469387755102
Confusion Matrix:
 [[22  2  0  2]
 [ 1 27  1  0]
 [ 3  0 11  2]
 [ 2  0  0 25]]
Classification Report:
               precision    recall  f1-score   support

  Medication       0.79      0.85      0.81        26
 Observation       0.93      0.93      0.93        29
    Referral       0.92      0.69      0.79        16
    

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3], got ['Medication' 'Observation' 'Referral' 'Surgery']

In [29]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Encode the target labels just for XGBoost
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Fit XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train_enc)

# Predict and inverse-transform to original labels
y_pred_enc = xgb_model.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)

# Evaluate
print("🧠 XGBoost")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


🧠 XGBoost
Accuracy: 0.8979591836734694
Confusion Matrix:
 [[24  2  0  0]
 [ 1 27  1  0]
 [ 2  0 13  1]
 [ 2  1  0 24]]
Classification Report:
               precision    recall  f1-score   support

  Medication       0.83      0.92      0.87        26
 Observation       0.90      0.93      0.92        29
    Referral       0.93      0.81      0.87        16
     Surgery       0.96      0.89      0.92        27

    accuracy                           0.90        98
   macro avg       0.90      0.89      0.89        98
weighted avg       0.90      0.90      0.90        98



Parameters: { "use_label_encoder" } are not used.



In [30]:
import matplotlib.pyplot as plt
xgb.plot_importance(xgb_model)
plt.show()

NameError: name 'xgb' is not defined

In [31]:
import joblib
joblib.dump(xgb_model, "xgboost_model.pkl")

['xgboost_model.pkl']

In [38]:
# Step 1: New raw input
new_patient_raw = pd.DataFrame([{
    'Age': 36,
    'Region': 'Nairobi',
    'Cyst Size (cm)': 5.2,
    'Symptoms': 'Fatigue, Pelvic pain',
    'Facility': 'City Hospital',
    'Available Stock': 'Yes',
    'Cost (KES)': 12000,
    'Has Insurance': 'Yes',
    'Menopause Status': 'Pre-menopausal',
    'Ultrasound Features': 'Solid mass',
    'Reported Symptoms': 'Fatigue, Pelvic pain',
    'Base Cost (KES)': 15000,
    'Insurance Copay (KES)': 3000,
    'Out-of-Pocket (KES)': 12000,
}])

# Step 2: One-hot encode
new_patient_encoded = pd.get_dummies(new_patient_raw)

# Step 3: Add missing columns (fill with 0)
for col in X_train.columns:
    if col not in new_patient_encoded.columns:
        new_patient_encoded[col] = 0

# Step 4: Remove extra columns not in training
new_patient_aligned = new_patient_encoded[X_train.columns]

# Step 5: Impute missing values
imputed_array = imputer.transform(new_patient_aligned)
new_patient_imputed = pd.DataFrame(imputed_array, columns=X_train.columns)

# Step 6: Predict with XGBoost
pred = xgb_model.predict(new_patient_imputed)
pred_label = label_encoder.inverse_transform(pred)

# Step 7: Output prediction
print("🧠 Predicted Recommended Management:", pred_label[0])


NameError: name 'label_encoder' is not defined

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import joblib

# === Define columns ===
categorical_cols = ['Region', 'Symptoms', 'Facility', 'Has Insurance',
                    'Menopause Status', 'Ultrasound Features', 'Reported Symptoms']
numerical_cols = ['Age', 'Cyst Size (cm)', 'Cyst Growth Rate cm/month', 'CA 125 Level',
                  'Base Cost (KES)', 'Insurance Copay (KES)', 'Out-of-Pocket (KES)']

# === Preprocessors ===
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# === Label encode the target ===
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# === Full pipeline ===
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)  # or RandomForestClassifier()
])

# === Fit on training data ===
pipeline.fit(X, y_encoded)

# Save model and label encoder
joblib.dump(pipeline, 'model_pipeline.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')


ValueError: A given column is not a column of the dataframe