In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file beneath the input directory,
# in the order os.walk visits them, then print one path per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/disease-symptoms-prediction-dataset/Final_Augmented_dataset_Diseases_and_Symptoms.csv


In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import joblib

# --- Configuration ---------------------------------------------------------
FILE_PATH = "/kaggle/input/disease-symptoms-prediction-dataset/Final_Augmented_dataset_Diseases_and_Symptoms.csv"
TARGET_COLUMN = 'diseases'
RANDOM_STATE = 42
MODEL_PATH = "rf_model_optimized.joblib"
LE_PATH = "label_encoder_optimized.joblib"
# A class needs at least this many rows so the stratified split below can
# place a member of it on each side of the train/test split.
MIN_SAMPLES_PER_CLASS = 2

# --- Load and clean --------------------------------------------------------
df = pd.read_csv(FILE_PATH)

# Drop rows without a label BEFORE filling missing values. A blanket
# fillna(0) would otherwise turn a missing label into the integer 0, creating
# a bogus class and a mixed-type target column that the label encoder cannot
# handle consistently. After the drop, fillna(0) only affects feature columns.
df = df.dropna(subset=[TARGET_COLUMN]).fillna(0)

# --- Remove classes too small to stratify ----------------------------------
class_counts = df[TARGET_COLUMN].value_counts()  # computed once, reused below
rare_classes = class_counts[class_counts < MIN_SAMPLES_PER_CLASS].index
df_filtered = df[~df[TARGET_COLUMN].isin(rare_classes)]

# Feature matrix (every column except the target) and the label series.
X = df_filtered.drop(columns=[TARGET_COLUMN])
y = df_filtered[TARGET_COLUMN]

# Learn the label -> integer mapping from the target, then encode the target.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Hold out 20% of the rows for testing; stratifying on the encoded labels
# keeps the class proportions the same on both sides of the split.
split_options = dict(
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

# Hyperparameters for the forest, collected in one place so they are easy to
# inspect or tweak before the estimator is built.
rf_params = {
    "n_estimators": 150,
    "max_depth": 20,
    "min_samples_leaf": 1,
    "max_features": 'sqrt',
    "class_weight": 'balanced',
    "n_jobs": -1,
    "random_state": RANDOM_STATE,  # fixed seed for reproducible training
    "verbose": 1,                  # emits joblib progress lines during fit/predict
}
rf = RandomForestClassifier(**rf_params)

print("Training optimized Random Forest...")
rf.fit(X_train, y_train)

# Predict on the held-out rows and report overall accuracy.
y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {acc:.4f}")

# Restrict the report to the labels actually present in the test split and
# map them back to their original names.
unique_labels = np.unique(y_test)
label_names = le.inverse_transform(unique_labels)
report = classification_report(
    y_test,
    y_pred,
    labels=unique_labels,
    target_names=label_names,
    zero_division=0,
    output_dict=True,
)

# Only the aggregate rows of the report are printed, not the per-class rows.
weighted_avg = report['weighted avg']
macro_avg = report['macro avg']

print("\nClassification Report (Macro/Weighted Avg):")
print(f"Weighted F1-score: {weighted_avg['f1-score']:.4f}")
print(f"Macro Avg Precision: {macro_avg['precision']:.4f}")
print(f"Macro Avg Recall: {macro_avg['recall']:.4f}")

# Persist the fitted model and its label encoder so predictions can later be
# decoded back to the original label names.
for artifact, artifact_path in ((rf, MODEL_PATH), (le, LE_PATH)):
    joblib.dump(artifact, artifact_path)
print(f"\nOptimized model saved to {MODEL_PATH} and {LE_PATH}")


Training optimized Random Forest...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   21.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   13.5s finished


Test set accuracy: 0.7034

Classification Report (Macro/Weighted Avg):
Weighted F1-score: 0.7470
Macro Avg Precision: 0.7390
Macro Avg Recall: 0.7791

Optimized model saved to rf_model_optimized.joblib and label_encoder_optimized.joblib


In [11]:
import shutil

# Publish the saved artifacts under their final download names in
# /kaggle/working/. The cell is safe to re-run: a file that has already been
# moved is left alone instead of raising FileNotFoundError on the second run.
OUTPUT_DIR = "/kaggle/working"
ARTIFACT_RENAMES = {
    "rf_model_optimized.joblib": "rf_model.joblib",
    "label_encoder_optimized.joblib": "label_encoder.joblib",
}

for source_name, final_name in ARTIFACT_RENAMES.items():
    destination = os.path.join(OUTPUT_DIR, final_name)
    if os.path.exists(source_name):
        shutil.move(source_name, destination)
    elif not os.path.exists(destination):
        # Neither the freshly saved file nor a previously moved copy exists:
        # the training cell has not been run, so fail with a clear message.
        raise FileNotFoundError(
            f"Neither {source_name!r} nor {destination!r} exists; "
            "run the training cell first."
        )
    # Otherwise the file was already moved by an earlier run; nothing to do.

print("Files moved to /kaggle/working/ and ready for download!")


Files moved to /kaggle/working/ and ready for download!
