In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
df = pd.read_csv('expandedd_dataset.csv')

In [None]:
# Encode categorical variables: 'location', 'month', and 'disease'
label_encoder_location = LabelEncoder()
label_encoder_month = LabelEncoder()
label_encoder_disease = LabelEncoder()

df['location'] = label_encoder_location.fit_transform(df['location'])
df['month'] = label_encoder_month.fit_transform(df['month'])
df['disease'] = label_encoder_disease.fit_transform(df['disease'])

# Feature Scaling: Scale only the numerical features
numerical_features = ['total', 'preasure', 'rain', 'sun', 'humidity', 'mean_temp',
                      'max_temp', 'min_temp', 'wind_gust', 'mean_wind_spd']

scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Step 3: Prepare the data for training

# Define X (features) and y (target)
X = df.drop(columns=['disease', 'ID'])  # Features
y = df['disease']  # Target (disease)

# Split the dataset into training and test sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Initialize individual models
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=1000, random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('lr', lr_model),
    ('xgb', xgb_model)
], voting='soft')

# Train the voting classifier
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred_voting = voting_clf.predict(X_test)

# Evaluate the model
print(f"Voting Classifier Accuracy: {accuracy_score(y_test, y_pred_voting) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_voting, target_names=label_encoder_disease.classes_))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_voting))


Parameters: { "use_label_encoder" } are not used.



Voting Classifier Accuracy: 24.67%

Classification Report:
               precision    recall  f1-score   support

       Asthma       0.17      0.21      0.19        43
      Cholera       0.14      0.08      0.11        24
    Dysentery       0.00      0.00      0.00        17
     Dysentry       0.30      0.32      0.31        25
  Guinea worm       0.17      0.13      0.15        15
  Guinea_worm       0.00      0.00      0.00         0
      Malaria       0.49      0.54      0.51        54
Skin diseases       0.07      0.04      0.05        26
Skin_diseases       0.23      0.29      0.26        24
      Typhoid       0.19      0.25      0.21        52
 Yellow fever       0.30      0.15      0.20        20
 Yellow_fever       0.00      0.00      0.00         0

     accuracy                           0.25       300
    macro avg       0.17      0.17      0.17       300
 weighted avg       0.23      0.25      0.24       300


Confusion Matrix:
[[ 9  2  1  9  1  0  3  1  1 16  0  0]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grids for each model
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20, 30],
    'lr__C': [0.01, 0.1, 1, 10, 100],
    'xgb__n_estimators': [50, 100, 200],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__max_depth': [3, 6, 10]
}

# Initialize Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('lr', lr_model),
    ('xgb', xgb_model)
], voting='soft')

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit the model using GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters from grid search
print("Best Parameters:", grid_search.best_params_)

# Make predictions using the best estimator
y_pred_voting = grid_search.best_estimator_.predict(X_test)

# Evaluate the tuned model
print(f"Voting Classifier Accuracy (Tuned): {accuracy_score(y_test, y_pred_voting) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_voting, target_names=label_encoder_disease.classes_))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_voting))


Fitting 3 folds for each of 1620 candidates, totalling 4860 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'lr__C': 100, 'rf__max_depth': 10, 'rf__n_estimators': 50, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 200}
Voting Classifier Accuracy (Tuned): 26.00%

Classification Report:
               precision    recall  f1-score   support

       Asthma       0.11      0.12      0.11        43
      Cholera       0.17      0.04      0.07        24
    Dysentery       0.00      0.00      0.00        17
     Dysentry       0.27      0.48      0.34        25
  Guinea worm       0.29      0.13      0.18        15
  Guinea_worm       0.00      0.00      0.00         0
      Malaria       0.48      0.54      0.51        54
Skin diseases       0.25      0.04      0.07        26
Skin_diseases       0.20      0.25      0.22        24
      Typhoid       0.22      0.40      0.29        52
 Yellow fever       0.25      0.05      0.08        20
 Yellow_fever       0.00      0.00      0.00         0

     accuracy                           0.26       300
    macro

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Step 2: Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Step 3: Initialize GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Step 4: Fit the GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Step 5: Print the best parameters and best score found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Step 6: Make predictions using the best model found by GridSearchCV
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

# Step 7: Evaluate the tuned model
accuracy = accuracy_score(y_test, y_pred)
print(f"Tuned Model Accuracy: {accuracy * 100:.2f}%")

# Get the unique classes in the test set
unique_classes_test = np.unique(y_test)

# Step 8: Generate the classification report dynamically
# Fetch the class labels corresponding to the unique classes in y_test
class_labels = label_encoder_disease.inverse_transform(unique_classes_test)

print("\nTuned Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_labels))

# Optional: Confusion Matrix
print("\nTuned Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=unique_classes_test))