In [27]:
!pip install imbalanced-learn
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
    --------------------------------------- 2.1/124.9 MB 10.7 MB/s eta 0:00:12
   - -------------------------------------- 3.7/124.9 MB 8.7 MB/s eta 0:00:14
   - -------------------------------------- 5.0/124.9 MB 8.4 MB/s eta 0:00:15
   - -------------------------------------- 5.5/124.9 MB 7.3 MB/s eta 0:00:17
   - -------------------------------------- 6.0/124.9 MB 6.0 MB/s eta 0:00:20
   -- ------------------------------------- 7.1/124.9 MB 5.7 MB/s eta 0:00:21
   -- ------------------------------------- 8.9/124.9 MB 6.0 MB/s eta 0:00:20
   --- ------------------------------------ 10.0/124.9 MB 6.0 MB/s eta 0:00:20
   --- ------------------------------------ 10.5/124.9 MB 5.7 MB/s eta 0:00:20
   --- ------------------------------------ 11.5/124.9 MB 5.5 MB/s eta 0:00:

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load dataset
file_path = "../dataset/Dengue-Dataset.csv"
df = pd.read_csv(file_path)

# Encode categorical variables
label_encoder = LabelEncoder()
df["Gender"] = label_encoder.fit_transform(df["Gender"])  # Male: 1, Female: 0
df["Result"] = label_encoder.fit_transform(df["Result"])  # Positive: 1, Negative: 0

# Separate features and target
X = df.drop(columns=["Result"])
y = df["Result"]

# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Train a Random Forest classifier with improved hyperparameters
model = RandomForestClassifier(
    n_estimators=50,  # Increase the number of trees
    max_depth=20,  # Limit tree depth to prevent overfitting
    min_samples_split=5,  # Require more samples to split nodes
    min_samples_leaf=2,  # Require more samples per leaf
    class_weight="balanced",
    random_state=42
)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
# print("Classification Report:\n", report)
# print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.85
