In [None]:
pip install pandas scikit-learn xgboost




In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Train one binary XGBoost classifier per condition column (binary relevance),
# print per-label metrics on a held-out split, and save the test-set predictions.

# Seed shared by the train/test split and every classifier.
RANDOM_STATE = 42

# Load the dataset
file_path = 'modified_Final.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Columns used as model inputs (X)
feature_columns = ['AGE', 'LOOKUPVALUE', 'MCHC', 'BASOPHIL', 'ABSOLUTE NEUTROPHIL COUNT',
                   'RDW-CV', 'ABSOLUTE MONOCYTE COUNT', 'MONOCYTE', 'ABSOLUTE EOSINOPHIL COUNT',
                   'ABSOLUTE BASOPHIL COUNT', 'WBC COUNT', 'NEUTROPHIL', 'LYMPHOCYTE',
                   'EOSINOPHIL', 'RBC COUNT', 'HAEMOGLOBIN', 'MCV', 'MCH', 'RDW-SD',
                   'PLATELET COUNT', 'ABSOLUTE LYMPHOCYTE COUNT', 'PCV']  # Features

# Target columns (y); each one is modelled as its own binary problem below
label_columns = ['No Major Condition Detected', 'Iron Deficiency Anemia', 'Hemolytic Anemia', 'Vitamin B12 & Folate Deficiency',
                 'Chronic Kidney Disease', 'Thalassemia', 'Sepsis', 'Liver Disease',
                 'Dengue', 'Malaria', 'Aplastic Anemia', 'Leukemia',
                 'Multiple Myeloma', 'Myelodysplastic Syndrome',
                 'Pernicious Anemia', 'General Infection',
                 'Hypothyroidism', 'Possible Autoimmune Disease']  # Labels

# Fail early and name every absent column, instead of silently skipping the
# encoding step and hitting a less specific KeyError during column selection.
missing_columns = [col for col in feature_columns + label_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{file_path} is missing expected columns: {missing_columns}")

# Preprocess non-numeric columns: LOOKUPVALUE is cast to str and replaced by
# integer codes so it can be used as a numeric feature.
label_encoder = LabelEncoder()
df['LOOKUPVALUE'] = label_encoder.fit_transform(df['LOOKUPVALUE'].astype(str))  # Convert to numeric

# Separate features (X) and labels (y)
X = df[feature_columns]
y = df[label_columns]

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train the model on each disease label separately (binary relevance approach).
# A fresh estimator is built per label and kept in `models`, so every fitted
# classifier stays available after the loop rather than only the last one.
models = {}
predictions = {}
for label in label_columns:
    print(f"Training model for {label}...")
    # Seed is passed explicitly rather than relying on the library default.
    model = xgb.XGBClassifier(objective='binary:logistic', random_state=RANDOM_STATE)
    model.fit(X_train, y_train[label])
    models[label] = model

    y_pred = model.predict(X_test)
    predictions[label] = y_pred

    # Evaluate the model. zero_division=0 reports 0 for a metric whose
    # denominator is empty instead of emitting a warning for it.
    print(f"Accuracy for {label}: {accuracy_score(y_test[label], y_pred):.2f}")
    print(classification_report(y_test[label], y_pred, zero_division=0))

# Combine predictions into a DataFrame for further analysis
# (one column per label, rows in the same order as X_test)
predictions_df = pd.DataFrame(predictions)

# Save predictions to a CSV file
output_file_path = "xgboost_predictions.csv"
predictions_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")


Training model for No Major Condition Detected...
Accuracy for No Major Condition Detected: 0.78
              precision    recall  f1-score   support

           0       0.79      0.84      0.81      1778
           1       0.75      0.68      0.72      1257

    accuracy                           0.78      3035
   macro avg       0.77      0.76      0.77      3035
weighted avg       0.77      0.78      0.77      3035

Training model for Iron Deficiency Anemia...
Accuracy for Iron Deficiency Anemia: 0.77
              precision    recall  f1-score   support

           0       0.79      0.95      0.86      2282
           1       0.59      0.21      0.31       753

    accuracy                           0.77      3035
   macro avg       0.69      0.58      0.59      3035
weighted avg       0.74      0.77      0.72      3035

Training model for Hemolytic Anemia...
Accuracy for Hemolytic Anemia: 0.79
              precision    recall  f1-score   support

           0       0.80      0.8