In [None]:
# import pandas as pd
# import numpy as np
# import tensorflow as tf
# from tensorflow import keras
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# import joblib

# # 1. Load the Data
# print("Loading dataset...")
# try:
#     df = pd.read_csv('dataset.csv')
#     print("Dataset loaded successfully.")
# except FileNotFoundError:
#     print("❌ ERROR: 'dataset.csv' not found. Please put it in the ml_model folder.")
#     exit()

# # 2. DATA CLEANING (The Fix is here 🛠️)
# print("Cleaning and restructuring data...")

# # Get all unique symptoms from the entire dataset
# cols = [i for i in df.columns if 'Symptom' in i]

# # Melt the data to get a long list of all symptoms per disease
# tmp = pd.melt(df.reset_index(), id_vars=['index'], value_vars=cols)
# tmp['value'] = tmp['value'].str.strip() # Remove extra spaces

# # Create a binary matrix (0s and 1s)
# dummies = pd.get_dummies(tmp, columns=['value'], prefix='', prefix_sep='')

# # FIX: We use numeric_only=True to prevent it from smashing strings together
# combined = dummies.groupby('index').sum(numeric_only=True)

# # Attach the Disease Label back
# y_raw = df['Disease']
# X_clean = combined

# # Align the index just in case
# y_raw = y_raw.iloc[combined.index]

# print(f"Data cleaned! Found {X_clean.shape[1]} unique symptoms.")

# # 3. Encode the Target (Disease Names -> Numbers)
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y_raw)

# # 4. Split Data & FORCE FLOAT TYPE
# # Now that we fixed step 2, this will work perfectly.
# X_values = X_clean.values.astype('float32') 
# y_encoded = y_encoded.astype('int32')

# X_train, X_test, y_train, y_test = train_test_split(X_values, y_encoded, test_size=0.2, random_state=42)

# # 5. Build the TensorFlow Model
# model = keras.Sequential([
#     keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
#     keras.layers.Dense(64, activation='relu'),
#     keras.layers.Dense(len(np.unique(y_encoded)), activation='softmax')
# ])

# model.compile(optimizer='adam',
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])

# # 6. Train
# print("Training Neural Network...")
# model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# # 7. Save Everything
# print("Saving model files...")
# model.save('healthcare_model.keras')
# joblib.dump(label_encoder, 'label_encoder.pkl')
# joblib.dump(X_clean.columns, 'features.pkl')

# print("✅ SUCCESS! Model trained on Pranay Patil's dataset.")


# 70-30 testing--------------------------------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Load Data
# Read the raw table from 'dataset.csv' in the current working directory.
print("Loading dataset...")
try:
    df = pd.read_csv('dataset.csv')
except FileNotFoundError:
    print("❌ Error: dataset.csv not found!")
    # Stop with a non-zero status so a shell, CI job, or notebook runner can
    # tell the run failed. The bare exit() helper is injected by the `site`
    # module for interactive use and exits with status 0 (success).
    raise SystemExit(1) from None

# 2. Data Cleaning
# Reshape the wide "Symptom*" columns into one numeric column per distinct
# symptom string, with one row per original record.
print("Cleaning Data...")

# Every column whose name contains 'Symptom' is treated as a symptom slot.
cols = [name for name in df.columns if 'Symptom' in name]

# Long format: one (record id, slot name, symptom string) row per cell.
tmp = df.reset_index().melt(id_vars=['index'], value_vars=cols)
# Strip surrounding whitespace so identical symptoms share one spelling.
tmp['value'] = tmp['value'].str.strip()

# One indicator column per symptom string, then add the indicators back up
# per record. numeric_only=True drops the leftover string 'variable' column
# instead of concatenating it.
dummies = pd.get_dummies(tmp, columns=['value'], prefix='', prefix_sep='')
combined = dummies.groupby('index').sum(numeric_only=True)

X_clean = combined
# Pick the disease labels in the same row order as the feature matrix.
y_raw = df['Disease'].iloc[combined.index]

# 3. Encode Target
# Fit the encoder on the disease names, then map every name to its integer id.
label_encoder = LabelEncoder().fit(y_raw)
y_encoded = label_encoder.transform(y_raw)

# 4. THE 70/30 SPLIT
# Cast to the dtypes handed to the network: float32 features, int32 labels.
X_values = X_clean.to_numpy().astype('float32')
y_encoded = y_encoded.astype('int32')

# test_size=0.3 holds out 30% of the rows for testing and leaves 70% for
# training; random_state=42 makes the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_values,
    y_encoded,
    test_size=0.3,
    random_state=42,
)

print(f"Total Data: {X_values.shape[0]} rows")
print(f"Training Data (70%): {X_train.shape[0]} rows")
print(f"Testing Data (30%): {X_test.shape[0]} rows")

# 5. Build Model
# Fully connected classifier: one input per feature column, two hidden ReLU
# layers (128 and 64 units), and a softmax output with one unit per distinct
# encoded label.
model = keras.Sequential([
    # Declare the input with an explicit Input object. Passing input_shape=
    # to the first Dense layer makes Keras emit its "Do not pass an
    # input_shape/input_dim argument to a layer" UserWarning.
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(len(np.unique(y_encoded)), activation='softmax'),
])

# sparse_categorical_crossentropy consumes integer class ids directly, so the
# labels are not one-hot encoded.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# 6. Train
# NOTE: validation_data is the test split, so the per-epoch val_* metrics are
# computed on the same rows the final report is evaluated on.
print("\nTraining Model (Epochs)...")
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=32,
    verbose=1,
)

# 7. SAVE EVERYTHING
# Three files are written to the current working directory.
print("\nSaving Model...")

# The trained network.
model.save(filepath='healthcare_model.keras')

# The fitted label encoder, to turn predicted class ids back into names.
joblib.dump(value=label_encoder, filename='label_encoder.pkl')

# The column labels of the feature matrix (pickled as a pandas Index).
joblib.dump(value=X_clean.columns, filename='features.pkl')

# ==========================================
# 8. GENERATE PERFORMANCE REPORT 📊
# ==========================================
print("\n--- TEST RESULTS (30% DATA) ---")

# Predict on the 30% test split; each row of class probabilities is reduced
# to the index of its most likely class.
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"✅ Final Test Accuracy: {acc * 100:.2f}%")

# Detailed Report (Precision, Recall, F1-Score)
print("\n--- Classification Report ---")
# Pass the full list of class ids explicitly. Without `labels`, the report
# infers the classes from y_test/y_pred and raises ValueError whenever a class
# is missing from both, because target_names would no longer match in length.
# zero_division=0 reports 0 for such classes instead of emitting warnings.
print(
    classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(label_encoder.classes_)),
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)

print("✅ DONE! You can take a screenshot of this report.")

Loading dataset...
Cleaning Data...
Total Data: 4920 rows
Training Data (70%): 3444 rows
Testing Data (30%): 1476 rows


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training Model (Epochs)...
Epoch 1/20
[1m108/108[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7584 - loss: 2.1783 - val_accuracy: 1.0000 - val_loss: 0.4114
Epoch 2/20
[1m108/108[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 0.1173 - val_accuracy: 1.0000 - val_loss: 0.0331
Epoch 3/20
[1m108/108[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0206 - val_accuracy: 1.0000 - val_loss: 0.0129
Epoch 4/20
[1m108/108[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0096 - val_accuracy: 1.0000 - val_loss: 0.0072
Epoch 5/20
[1m108/108[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0057 - va