In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import time

# 1. Load the dataset
df = pd.read_csv('dataset.csv')

# 2. Filter out rare diseases (keep only diseases with ≥2 cases)
min_samples = 2
disease_counts = df['diseases'].value_counts()
common_diseases = disease_counts[disease_counts >= min_samples].index
df_filtered = df[df['diseases'].isin(common_diseases)]

# 3. Encode disease labels
le = LabelEncoder()
y = le.fit_transform(df_filtered['diseases'])  # Convert disease names to numbers
X = df_filtered.drop('diseases', axis=1)       # Features (all columns except 'diseases')

In [5]:
# 4. Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,  # For reproducibility
    stratify=y       # Maintains disease distribution in splits
)

print(f"\nData Summary:")
print(f"- Training samples: {X_train.shape[0]}")
print(f"- Test samples: {X_test.shape[0]}")
print(f"- Number of diseases: {len(le.classes_)}")
print(f"- Number of symptoms/features: {X_train.shape[1]}")


Data Summary:
- Training samples: 197540
- Test samples: 49386
- Number of diseases: 754
- Number of symptoms/features: 377


In [6]:
# 5. Configure XGBoost model
# 5. Configure XGBoost model for CPU inference
model = XGBClassifier(
    objective='multi:softmax',    # Multi-class classification
    num_class=len(le.classes_),  # Number of diseases
    n_estimators=200,            # Number of trees
    max_depth=6,                 # Tree depth
    learning_rate=0.1,           # Learning rate
    tree_method='hist',          # Fast histogram algorithm (CPU-compatible)
    predictor='cpu_predictor',   # Force CPU prediction
    eval_metric='mlogloss',      # Evaluation metric
    early_stopping_rounds=10,    # Early stopping
    verbosity=1                  # Logging level
)


In [None]:
# 6. Train the model
print("\nStarting training...")
start_time = time.time()

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],  # Evaluate on test set
    verbose=True                  # Show progress
)

training_time = (time.time() - start_time) / 60
print(f"\nTraining completed in {training_time:.1f} minutes")


Starting training...


Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-mlogloss:3.55700
[1]	validation_0-mlogloss:2.60406
[2]	validation_0-mlogloss:2.23674
[3]	validation_0-mlogloss:1.99859
[4]	validation_0-mlogloss:1.81792
[5]	validation_0-mlogloss:1.67469
[6]	validation_0-mlogloss:1.55582
[7]	validation_0-mlogloss:1.45494
[8]	validation_0-mlogloss:1.36858
[9]	validation_0-mlogloss:1.29275
[10]	validation_0-mlogloss:1.22544
[11]	validation_0-mlogloss:1.16485
[12]	validation_0-mlogloss:1.10991
[13]	validation_0-mlogloss:1.06089
[14]	validation_0-mlogloss:1.01673
[15]	validation_0-mlogloss:0.97631
[16]	validation_0-mlogloss:0.93912
[17]	validation_0-mlogloss:0.90471
[18]	validation_0-mlogloss:0.87321
[19]	validation_0-mlogloss:0.84412
[20]	validation_0-mlogloss:0.81739
[21]	validation_0-mlogloss:0.79288
[22]	validation_0-mlogloss:0.77019
[23]	validation_0-mlogloss:0.74902
[24]	validation_0-mlogloss:0.72952
[25]	validation_0-mlogloss:0.71118
[26]	validation_0-mlogloss:0.69424
[27]	validation_0-mlogloss:0.67828
[28]	validation_0-mlogloss:0.6

In [None]:
import joblib
joblib.dump(model, 'disease_predictor_model.pkl')
joblib.dump(le, 'label_encoder.pkl')


NameError: name 'model' is not defined

In [None]:
#Calculate Accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

NameError: name 'y_test' is not defined