Complex Neural Network

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
import numpy as np
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Dropout

# Make weight initialisation, dropout masks and batch shuffling repeatable.
tf.keras.utils.set_random_seed(42)

# Specify the encoding when reading the CSV file
df = pd.read_csv('diseases.csv', encoding='latin-1') # or 'cp1252' if 'latin-1' doesn't work

# Feature selection (Symptoms)
# Missing symptoms become '' so that "no symptom" gets its own one-hot column.
X = df[['symptom_1', 'symptom_2', 'symptom_3', 'symptom_4']].fillna('')

# Target selection (Diseases)
y = df['Diseases']

# Encode categorical variables (both symptoms and diseases)
X_encoded = pd.get_dummies(X, columns=['symptom_1', 'symptom_2', 'symptom_3', 'symptom_4'])
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# Scale the features.
# The scaler is fitted on the training rows only: fitting it on X_encoded would
# leak test-set statistics and would leave X_train with more rows than y_train.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# One output unit per encoded disease label.
num_classes = len(np.unique(y_encoded))

model1 = Sequential()
# Explicit Input layer instead of passing input_dim to the first Dense layer.
model1.add(tf.keras.Input(shape=(X_train.shape[1],)))
model1.add(Dense(256))  # Increased number of units
model1.add(LeakyReLU(alpha=0.01))
model1.add(Dropout(0.3))  # Adding dropout to prevent overfitting

model1.add(Dense(128))
model1.add(LeakyReLU(alpha=0.01))
model1.add(Dropout(0.3))

model1.add(Dense(64, activation='relu'))
model1.add(Dropout(0.3))

model1.add(Dense(32, activation='relu'))

model1.add(Dense(num_classes, activation='softmax'))

# Compile the model
# sparse_categorical_crossentropy matches the integer labels produced by LabelEncoder.
model1.compile(optimizer=RMSprop(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train on the scaled training split only, so the evaluation below is on rows
# the network has not seen and on features in the same (scaled) space.
# NOTE(review): if most diseases occur only once in the CSV, held-out accuracy
# will be close to zero because the test classes never appear in training.
# Confirm the per-class row counts before trusting this number.
history = model1.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)
loss, accuracy = model1.evaluate(X_test, y_test, verbose=0)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 72ms/step - accuracy: 0.0000e+00 - loss: 4.8228 - val_accuracy: 0.0000e+00 - val_loss: 4.8350
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0277 - loss: 4.8063 - val_accuracy: 0.0000e+00 - val_loss: 4.8430
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.0155 - loss: 4.8013 - val_accuracy: 0.0000e+00 - val_loss: 4.8537
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0185 - loss: 4.7899 - val_accuracy: 0.0000e+00 - val_loss: 4.8641
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0216 - loss: 4.7824 - val_accuracy: 0.0000e+00 - val_loss: 4.8788
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.0310 - loss: 4.7696 - val_accuracy: 0.0000e+00 - val_loss: 4.8967
Epoch 7/100


Accuracy Finder

In [None]:
# Score the neural network (model1) on every row of the CSV.
# NOTE: this reads the same 'diseases.csv' the model was built from, so the
# figure printed below is not a held-out estimate.
test_file_path = 'diseases.csv'  # Replace with your file path
test_df = pd.read_csv(test_file_path, encoding='latin1')

# Encode the symptoms exactly as the training features were encoded: the same
# four columns, with missing values replaced by '' first. Without the fillna,
# rows with a missing symptom would lose their 'symptom_k_' indicator column.
symptom_cols = ['symptom_1', 'symptom_2', 'symptom_3', 'symptom_4']
test_symptoms = test_df[symptom_cols].fillna('')
test_dummies = pd.get_dummies(test_symptoms, columns=symptom_cols)

# Align the test data with the training data columns (filling missing columns with 0)
X_test_encoded = test_dummies.reindex(columns=X_encoded.columns, fill_value=0)
# Standardize the test data using the same scaler as in training
X_test_scaled = scaler.transform(X_test_encoded)

y_pred = model1.predict(X_test_scaled)

# Get the predicted disease classes
y_pred_class = np.argmax(y_pred, axis=1)

# Convert back to disease names
predicted_diseases = le.inverse_transform(y_pred_class)
print(predicted_diseases)
y_true = test_df['Diseases']
accuracy = accuracy_score(y_true, predicted_diseases)
print(f"Model Accuracy on Test Data: {accuracy * 100:.2f}%")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
['Influenza' 'Diabetes Mellitus' 'Hypertension' 'Asthma' 'Chickenpox'
 'Tuberculosis' 'Hepatitis' 'Heart Disease' 'Arthritis' 'Allergies'
 'Depression' 'Anxiety Disorders' 'Gastroesophageal Reflux Disease (GERD)'
 'Stroke' 'Pneumonia' 'Celiac Disease' 'Fibromyalgia' 'Multiple Sclerosis'
 'Kidney Disease' 'Skin Cancer' 'Lyme Disease' 'Sickle Cell Anemia'
 'Parkinson\x92s Disease' 'HIV/AIDS' 'Zika Virus' 'Alzheimers Disease'
 'Chronic Fatigue Syndrome' 'Psoriasis' 'Irritable Bowel Syndrome'
 'Migraine' 'Anemia' 'Gout' 'Bronchitis' 'Sinusitis' 'Hyperthyroidism'
 'Hypothyroidism' 'Gallbladder Disease' 'Pancreatitis' 'Meningitis'
 'Ulcerative Colitis' 'Crohn\x92s Disease' 'Lupus' 'Eczema' 'Pancreatitis'
 'Deep Vein Thrombosis (DVT)' 'Glaucoma' 'Cataracts'
 'Retinitis Pigmentosa' 'Macular Degeneration' 'Tetanus' 'Whooping Cough'
 'Leprosy' 'Ebola Virus Disease' 'Dengue Fever' 'Malaria' 'Cholera'
 'Rabies' 'Anthrax' 'Legi

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the *unscaled* one-hot features for evaluation.
# The forest must be scored on rows it was not fitted on, and on features in
# the same space it was fitted on. Scaling is not needed for a random forest,
# so train and test both stay unscaled here.
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)

# Initialize and train RandomForest model
rf_model = RandomForestClassifier(n_estimators=125, random_state=42)
rf_model.fit(X_train_rf, y_train_rf)

# Evaluate RandomForest on the test set
rf_accuracy = rf_model.score(X_test_rf, y_test_rf)
print(f"RandomForest Model Accuracy: {rf_accuracy * 100:.2f}%")


RandomForest Model Accuracy: 100.00%




Random Forest with Grid Search

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold  # Import KFold

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [150, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Hold out 20% of the *unscaled* one-hot features. The search only ever sees
# the training part, so the final score below is on unseen rows and in the
# same feature space the forest was fitted on.
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)

# Initialize RandomForest with GridSearch
rf_model = RandomForestClassifier(random_state=42)
# Use KFold instead of StratifiedKFold
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=KFold(n_splits=3), n_jobs=-1, verbose=2)
#cv=3 defaults to StratifiedKFold. Using KFold will create folds without considering class proportions

# Train the model (training split only; the best estimator is refit on it)
grid_search.fit(X_train_rf, y_train_rf)

# Best parameters and accuracy
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_ * 100:.2f}%")

# Evaluate on the test set
y_pred = grid_search.best_estimator_.predict(X_test_rf)
final_accuracy = accuracy_score(y_test_rf, y_pred)
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}%")

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Best accuracy: 0.00%
Final Test Accuracy: 100.00%




Class-Weighted Training

In [None]:
# Class weighting: continue training the network with per-class loss weights.
from sklearn.utils.class_weight import compute_class_weight

# Fail early with a readable message if the two arrays are out of step.
if len(X_train) != len(y_train):
    raise ValueError(
        f"X_train has {len(X_train)} rows but y_train has {len(y_train)} labels"
    )

# Calculate class weights for the labels that actually occur in y_train
classes_present = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes_present, y=y_train)

# Key each weight by its class label, not by its position in classes_present:
# the two differ whenever a label is absent from the training split.
# Labels absent from y_train get a neutral weight so the keys cover every
# encoded class 0..n-1.
class_weights_dict = {label: 1.0 for label in range(len(le.classes_))}
class_weights_dict.update(zip(classes_present.tolist(), class_weights.tolist()))

# Train the neural network with class weights
# (model1 is the network built earlier in this notebook)
model1.fit(X_train, y_train, epochs=150, batch_size=32, validation_split=0.2, class_weight=class_weights_dict, verbose=1)


Epoch 1/150
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 374ms/step - accuracy: 0.9618 - loss: 0.1332 - val_accuracy: 0.0000e+00 - val_loss: 11.3757
Epoch 2/150
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 1.0000 - loss: 0.0452 - val_accuracy: 0.0000e+00 - val_loss: 11.4309
Epoch 3/150
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - accuracy: 0.9757 - loss: 0.1250 - val_accuracy: 0.0000e+00 - val_loss: 11.5050
Epoch 4/150
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 0.9757 - loss: 0.0737 - val_accuracy: 0.0000e+00 - val_loss: 11.4724
Epoch 5/150
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.9757 - loss: 0.1248 - val_accuracy: 0.0000e+00 - val_loss: 11.4930
Epoch 6/150
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.9861 - loss: 0.0534 - val_accuracy: 0.0000e+00 - val_loss: 11.5633
Epoch

<keras.src.callbacks.history.History at 0x7aa3a6641ab0>

Diseases Prediction

In [None]:
# Predict a disease for one hand-written list of symptoms.
# Each symptom is one-hot encoded per column, so the position of a symptom in
# this list matters as well as its text.
new_symptoms = ['Wheezing', 'Coughing', 'Chest tightness','Shortness of breath','','']

new_symptoms_df = pd.DataFrame([new_symptoms], columns=['symptom_1', 'symptom_2', 'symptom_3','symptom_4', 'symptom_5', 'symptom_6'])

# One-hot encode the new symptoms using the same method as the training data
new_symptoms_encoded = pd.get_dummies(new_symptoms_df, columns=['symptom_1', 'symptom_2', 'symptom_3', 'symptom_4', 'symptom_5', 'symptom_6'])

# Align the new symptoms dataframe with the training encoded dataframe
# This ensures that all the columns used in training are present in the new data (with missing ones filled with 0)
# and that columns the training features do not have are dropped.
X_new_symptoms = new_symptoms_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# Standardize the new symptoms using the same scaler as the training data
X_new_symptoms_scaled = scaler.transform(X_new_symptoms)

# Predict the disease using the trained model
# (model1 is the network built earlier in this notebook)
predicted_class = np.argmax(model1.predict(X_new_symptoms_scaled), axis=1)

# Decode the predicted class back to the disease name
predicted_disease = le.inverse_transform(predicted_class)

# Output the predicted disease
print(f"Predicted Disease: {predicted_disease[0]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predicted Disease: Asthma


Accuracy finder

In [None]:
# Score the grid-searched random forest on every row of the CSV.
# NOTE: this reads the same 'diseases.csv' the features were built from, so the
# figure printed below is not a held-out estimate.
test_file_path = 'diseases.csv'
test_df = pd.read_csv(test_file_path, encoding='latin1')

# Encode the symptoms exactly as the training features were encoded: the same
# four columns, with missing values replaced by '' first.
symptom_cols = ['symptom_1', 'symptom_2', 'symptom_3', 'symptom_4']
test_symptoms = test_df[symptom_cols].fillna('')
test_dummies = pd.get_dummies(test_symptoms, columns=symptom_cols)
X_test_encoded = test_dummies.reindex(columns=X_encoded.columns, fill_value=0)

# The forest was fitted on the unscaled one-hot features, so it must predict on
# unscaled features too. The StandardScaler belongs to the neural network only.
y_pred = grid_search.best_estimator_.predict(X_test_encoded)
predicted_diseases = le.inverse_transform(y_pred)

print(predicted_diseases)
y_true = test_df['Diseases']
accuracy = accuracy_score(y_true, predicted_diseases)
print(f"Model Accuracy on Test Data: {accuracy * 100:.2f}%")

['Influenza' 'Diabetes Mellitus' 'Hypertension' 'Asthma' 'Chickenpox'
 'Tuberculosis' 'Hepatitis' 'Heart Disease' 'Arthritis' 'Allergies'
 'Depression' 'Anxiety Disorders' 'Gastroesophageal Reflux Disease (GERD)'
 'Stroke' 'Pneumonia' 'Celiac Disease' 'Fibromyalgia' 'Multiple Sclerosis'
 'Kidney Disease' 'Skin Cancer' 'Lyme Disease' 'Sickle Cell Anemia'
 'Parkinson\x92s Disease' 'HIV/AIDS' 'Zika Virus' 'Alzheimers Disease'
 'Chronic Fatigue Syndrome' 'Psoriasis' 'Irritable Bowel Syndrome'
 'Migraine' 'Anemia' 'Gout' 'Bronchitis' 'Sinusitis' 'Hyperthyroidism'
 'Hypothyroidism' 'Gallbladder Disease' 'Pancreatitis' 'Meningitis'
 'Ulcerative Colitis' 'Crohn\x92s Disease' 'Lupus' 'Eczema' 'Pancreatitis'
 'Deep Vein Thrombosis (DVT)' 'Glaucoma' 'Cataracts'
 'Retinitis Pigmentosa' 'Macular Degeneration' 'Tetanus' 'Whooping Cough'
 'Leprosy' 'Ebola Virus Disease' 'Dengue Fever' 'Malaria' 'Cholera'
 'Rabies' 'Anthrax' 'Legionnaires\x92 Disease' 'Hantavirus'
 'Toxoplasmosis' 'Shingles' 'Measles'



Inaccurate Results

In [None]:
# List every row whose predicted disease differs from the true one,
# printed as "<true>   <predicted>".
# zip pairs the two sequences by position, so this does not depend on the
# index labels of y_true.
for actual, predicted in zip(y_true, predicted_diseases):
    if predicted != actual:
        print(actual, " ", predicted)

Diverticulitis   Pancreatitis


In [None]:
# Names of the symptom columns to scan: symptom_1 .. symptom_5.
symptom_columns = [f'symptom_{k}' for k in range(1, 6)]

# Stack the symptom columns end to end, in column order, into one long series.
per_column = [df[name] for name in symptom_columns]
all_symptoms = pd.concat(per_column)

# Distinct values, in order of first appearance.
unique_symptoms = all_symptoms.unique()

# Show the distinct symptoms, then how many there are.
print(unique_symptoms)
print(len(unique_symptoms))

['Fever' 'Increased thirst' 'Headaches' 'Wheezing' 'Itchy rash' 'Coughing'
 'Fatigue' 'Chest pain' 'Joint pain' 'Sneezing' 'Persistent sadness'
 'Excessive worry' 'Heartburn' 'Sudden numbness' 'Cough' 'Diarrhea'
 'Widespread pain' 'Numbness' 'Swelling' 'New or changing moles'
 'Pain episodes' 'Tremors' 'Flu-like symptoms' 'Memory loss'
 'Extreme fatigue' 'Red patches' 'Abdominal pain' 'Severe headache'
 'Persistent cough' 'Nasal congestion' 'Weight loss' 'Itchy skin'
 'Leg pain' 'Eye pain' 'Clouded vision' 'Vision loss' 'Blurred vision'
 'Muscle stiffness' 'Severe cough' 'Skin lesions' 'High fever'
 'Sore throat' 'Painful rash' 'Swollen salivary glands' 'Rash'
 'Excessive bleeding' 'Skin tightening' 'Weight gain' 'Anxiety'
 'Bone fractures' 'Tall stature' 'Bone pain' 'Jaundice'
 'Ringing in the ears' 'Cold fingers/toes' 'Loss of skin color'
 'Facial weakness' 'Vomiting' 'Weakness in legs' 'Blisters'
 'Hypermobile joints' 'Dry eyes' 'Coughing blood' 'Difficulty swallowing'
 'Severe weak