In [None]:
# Step 1: Install required libraries (comment this line out if they are already installed)
!pip install lazypredict scikit-learn pandas openpyxl

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyClassifier

# Step 2: Load the dataset (CSV or Excel)
# Replace with your actual filename
filename = "/content/AIDS_Classification.csv"  # or e.g. "your_dataset.xlsx"
target_col = "infected"  # name of the label column

# Compare the extension case-insensitively so "DATA.CSV" is accepted too.
filename_lower = filename.lower()
if filename_lower.endswith(".csv"):
    df = pd.read_csv(filename)
elif filename_lower.endswith(".xlsx"):
    df = pd.read_excel(filename)
else:
    raise ValueError("Unsupported file format. Use CSV or Excel.")

# Step 3: Drop rows with a missing target.
# Missing feature values are imputed AFTER the split (Step 6) so that the
# imputation statistics come from the training rows only.
df = df.dropna(subset=[target_col])

# Step 4: Define features and target
X = df.drop(columns=[target_col])  # input features
y = df[target_col]  # target label

# Step 5: Handle non-numeric data (if any)
# One-hot encoding uses no cross-row statistics, so it is safe to do before
# splitting and guarantees train and test share the same columns.
X = pd.get_dummies(X, drop_first=True)

# Step 6: Split data into train/test sets BEFORE any fitted preprocessing.
# stratify=y keeps the class ratio the same in both splits, which matters
# because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Impute numeric gaps with means learned from the training rows only, then
# apply those same means to the test rows (no test-set information leaks in).
train_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Step 7: Scale features
# Fit the scaler on the training split only; the test split is transformed
# with the training mean/std so evaluation reflects truly unseen data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the train-fitted statistics; kept so that
# any later cell referring to X_scaled keeps working.
X_scaled = scaler.transform(X.fillna(train_means))

# Step 8: Run LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Step 9: Show results
print("Model performance comparison:\n")
print(models)


Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow>=2.0.0->lazypredict)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow>=2.0.0->lazypredict)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow>=2.0.0->lazypredict)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-ski

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 365, number of negative: 1132
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1880
[LightGBM] [Info] Number of data points in the train set: 1497, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243821 -> initscore=-1.131844
[LightGBM] [Info] Start training from score -1.131844
Model performance comparison:

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.88               0.81     0.81      0.88   
XGBClassifier                      0.88               0.81     0.81      0.87   
AdaBoostClassifier                 0.88               0.81     0.81      0.88   
SGDClassif