In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

# Load full dataset
df = pd.read_csv('/mnt/k/ml/clg_ml/symptom_based_diseaese_detection/Training.csv')

# Split into features and labels
X = df.drop('prognosis', axis=1)
y = df['prognosis']

# Encode the target labels
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Stratified split (80% train, 20% test)
X_train, X_test, y_train_enc, y_test_enc = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# Train model on all 132 features
rf_full = RandomForestClassifier(n_estimators=100, random_state=42)
rf_full.fit(X_train, y_train_enc)

# Evaluate initial model
preds_full = rf_full.predict(X_test)
print("\n[Initial Model - All 132 Features]")
print("Accuracy:", accuracy_score(y_test_enc, preds_full))
print(classification_report(y_test_enc, preds_full, zero_division=0))

# Feature importance → top 30 features
importances = rf_full.feature_importances_
top_indices = importances.argsort()[::-1][:30]
top_features = X.columns[top_indices]

print("\nTop 30 selected symptoms:")
print(list(top_features))

# Reduce to top 30 features
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# Retrain model on top 30 features
rf_top = RandomForestClassifier(n_estimators=100, random_state=42)
rf_top.fit(X_train_top, y_train_enc)

# Evaluate retrained model
preds_top = rf_top.predict(X_test_top)
print("\n[Retrained Model - Top 30 Features]")
print("Accuracy:", accuracy_score(y_test_enc, preds_top))
print(classification_report(y_test_enc, preds_top, zero_division=0))

# Save model and label encoder
joblib.dump(rf_top, 'final_rf_model_top30.pkl')
joblib.dump(le, 'label_encoder.pkl')

# Save selected symptoms
top_features.to_series().to_csv("selected_symptoms.csv", index=False)

print("\n✅ Model, label encoder, and top 30 symptom list saved successfully.")


[Initial Model - All 132 Features]
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        24
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        24
           6       1.00      1.00      1.00        24
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        24
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        24
          11       1.00      1.00      1.00        24
          12       1.00      1.00      1.00        24
          13       1.00      1.00      1.00        24
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00        24
          16       1.00      1.