In [3]:
# ===============================
# ðŸ“˜ AI MedBot ML Model Training
# ===============================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib

# --- Load datasets ---
symptom_df = pd.read_excel(r"D:\Machine-Learning-Projects\AI_MEDBOT_1\ml\datasets\data.xlsx")
heart_df1 = pd.read_excel(r"D:\Machine-Learning-Projects\AI_MEDBOT_1\ml\datasets\Heart_disease_statlog.xlsx")
heart_df2 = pd.read_excel(r"D:\Machine-Learning-Projects\AI_MEDBOT_1\ml\datasets\heart.xlsx")

print("âœ… Datasets loaded successfully")
print("Symptom columns:", list(symptom_df.columns))
print("Heart1 columns:", list(heart_df1.columns))
print("Heart2 columns:", list(heart_df2.columns))

# --- Clean and unify datasets ---
# Convert all column names to lowercase for consistency
symptom_df.columns = symptom_df.columns.str.lower()
heart_df1.columns = heart_df1.columns.str.lower()
heart_df2.columns = heart_df2.columns.str.lower()

# Rename targets to a common name
symptom_df = symptom_df.rename(columns={"disease": "target"})
heart_df1 = heart_df1.rename(columns={"target": "target"})
heart_df2 = heart_df2.rename(columns={"heartdisease": "target"})

# --- Align structure ---
# For datasets missing some columns, we fill missing ones with 0 or NaN
common_cols = set(symptom_df.columns) | set(heart_df1.columns) | set(heart_df2.columns)
for df in [symptom_df, heart_df1, heart_df2]:
    for col in common_cols:
        if col not in df.columns:
            df[col] = None

# --- Merge all ---
merged_df = pd.concat([symptom_df, heart_df1, heart_df2], ignore_index=True)

print("âœ… Merged dataset shape:", merged_df.shape)
print("Columns in merged dataset:", merged_df.columns.tolist())

# --- Drop rows with missing targets ---
merged_df = merged_df.dropna(subset=["target"])

# --- Encode categorical features ---
label_encoders = {}
for col in merged_df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    merged_df[col] = le.fit_transform(merged_df[col].astype(str))
    label_encoders[col] = le

# --- Split into features and target ---
X = merged_df.drop(columns=["target"])
y = merged_df["target"]

# --- Split train/test ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Train model ---
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

print("âœ… Model trained successfully!")
print("Training accuracy:", model.score(X_train, y_train))
print("Testing accuracy:", model.score(X_test, y_test))

# --- Save model and encoders ---
joblib.dump(model, r"D:\Machine-Learning-Projects\AI_MEDBOT_1\ml\medical_model.pkl")
joblib.dump(label_encoders, r"D:\Machine-Learning-Projects\AI_MEDBOT_1\ml\label_encoders.pkl")

print("ðŸ’¾ Model and encoders saved successfully!")


âœ… Datasets loaded successfully
Symptom columns: ['fever', 'cough', 'fatigue', 'headache', 'sore_throat', 'chest_pain', 'shortness_of_breath', 'disease']
Heart1 columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
Heart2 columns: ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']
âœ… Merged dataset shape: (1197, 29)
Columns in merged dataset: ['fever', 'cough', 'fatigue', 'headache', 'sore_throat', 'chest_pain', 'shortness_of_breath', 'target', 'chol', 'oldpeak', 'cholesterol', 'thal', 'thalach', 'exang', 'sex', 'cp', 'fastingbs', 'maxhr', 'exerciseangina', 'restingbp', 'chestpaintype', 'fbs', 'ca', 'trestbps', 'restecg', 'slope', 'st_slope', 'restingecg', 'age']


  merged_df = pd.concat([symptom_df, heart_df1, heart_df2], ignore_index=True)


âœ… Model trained successfully!
Training accuracy: 1.0
Testing accuracy: 0.8583333333333333
ðŸ’¾ Model and encoders saved successfully!
