In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [None]:
# Location of the symptom/disease table in the Kaggle input directory.
file_path = '/kaggle/input/disease-dataset1/dataset.csv'
# Load the table; later cells read its 'Disease' column and treat the rest as symptoms.
dataset = pd.read_csv(file_path)

In [None]:
# Preview the first rows of the raw table.
dataset.head()

In [None]:
import pandas as pd

# Identify symptom columns (assumed to be all columns except 'Disease')
# NOTE(review): this is a positional slice, so it is only correct while 'Disease'
# is column 0 of `dataset` — confirm against the preview above.
symptom_columns = dataset.columns[1:]  # Assuming 'Disease' is in the first column

# Row-wise helper: order one record's symptoms alphabetically (NaN becomes 'Unknown')
def sort_symptoms(row):
    """Return the symptom cells of ``row`` as strings in ascending order.

    Reads the module-level ``symptom_columns`` to decide which cells to use.
    Missing cells are replaced by the text 'Unknown' before sorting, and the
    result is a Series indexed by ``symptom_columns`` (values are re-assigned
    to the columns in sorted order, not kept in their original columns).
    """
    present = row[symptom_columns].fillna('Unknown')
    ordered = present.astype(str).tolist()
    ordered.sort()
    return pd.Series(ordered, index=symptom_columns)

# Sort the symptoms of every record (one call of sort_symptoms per row)
sorted_symptoms_df = dataset.apply(sort_symptoms, axis=1)

# Re-attach the target so the result is [sorted symptom columns..., 'Disease']
sorted_data = sorted_symptoms_df.join(dataset['Disease'])

# Show the first few sorted rows
print(sorted_data.head())

# To keep a copy on disk, write sorted_data with to_csv(..., index=False)

In [None]:
import csv
import json

# Read the symptom-severity table into `data`, a list of
# {'serial', 'name', 'weight'} records.
# newline='' is how the csv module asks for files to be opened, so that line
# breaks inside quoted fields are handled by the parser rather than by open().
with open("/kaggle/input/disease-dataset1/Symptom-severity.csv", newline='') as f:
    reader = csv.reader(f)
    # Skip the header; the default stops an empty file from raising StopIteration.
    next(reader, None)
    data = []
    for row in reader:
        # csv.reader yields [] for blank lines; anything without both columns
        # would raise IndexError below, so it is skipped.
        if len(row) < 2:
            continue
        data.append({
            # zero-based position among the rows that were kept
            'serial': len(data),
            'name': row[0],
            # left as the text read from the file (no numeric conversion)
            'weight': row[1],
        })

In [None]:
# Filling missing symptom values with a placeholder 'Unknown'
imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
# sorted_data is laid out as [symptom columns..., 'Disease'] because the target
# is concatenated last. Selecting by name keeps every symptom column and leaves
# the label out; the positional slice iloc[:, 1:] did the opposite (it skipped
# the first symptom column and carried 'Disease' into the features).
symptoms = sorted_data.drop(columns=['Disease'])
symptoms_filled = pd.DataFrame(imputer.fit_transform(symptoms), columns=symptoms.columns)

In [None]:
# Preview the sorted table (symptom columns followed by 'Disease').
sorted_data.head()

In [None]:
# Encoding categorical features (symptoms)
# `encoder` is a fixed lookup table from symptom name to integer code.
# NOTE(review): 'fluid_overload' maps to 117 and code 45 is never assigned, and
# 'prognosis' (132) does not look like a symptom name — confirm against whatever
# produced this table before renumbering anything.
encoder = {'itching': 0, 'skin_rash': 1, 'nodal_skin_eruptions': 2, 'continuous_sneezing': 3, 'shivering': 4, 'chills': 5, 'joint_pain': 6, 'stomach_pain': 7, 'acidity': 8, 'ulcers_on_tongue': 9, 'muscle_wasting': 10, 'vomiting': 11, 'burning_micturition': 12, 'spotting_urination': 13, 'fatigue': 14, 'weight_gain': 15, 'anxiety': 16, 'cold_hands_and_feets': 17, 'mood_swings': 18, 'weight_loss': 19, 'restlessness': 20, 'lethargy': 21, 'patches_in_throat': 22, 'irregular_sugar_level': 23, 'cough': 24, 'high_fever': 25, 'sunken_eyes': 26, 'breathlessness': 27, 'sweating': 28, 'dehydration': 29, 'indigestion': 30, 'headache': 31, 'yellowish_skin': 32, 'dark_urine': 33, 'nausea': 34, 'loss_of_appetite': 35, 'pain_behind_the_eyes': 36, 'back_pain': 37, 'constipation': 38, 'abdominal_pain': 39, 'diarrhoea': 40, 'mild_fever': 41, 'yellow_urine': 42, 'yellowing_of_eyes': 43, 'acute_liver_failure': 44, 'fluid_overload': 117, 'swelling_of_stomach': 46, 'swelled_lymph_nodes': 47, 'malaise': 48, 'blurred_and_distorted_vision': 49, 'phlegm': 50, 'throat_irritation': 51, 'redness_of_eyes': 52, 'sinus_pressure': 53, 'runny_nose': 54, 'congestion': 55, 'chest_pain': 56, 'weakness_in_limbs': 57, 'fast_heart_rate': 58, 'pain_during_bowel_movements': 59, 'pain_in_anal_region': 60, 'bloody_stool': 61, 'irritation_in_anus': 62, 'neck_pain': 63, 'dizziness': 64, 'cramps': 65, 'bruising': 66, 'obesity': 67, 'swollen_legs': 68, 'swollen_blood_vessels': 69, 'puffy_face_and_eyes': 70, 'enlarged_thyroid': 71, 'brittle_nails': 72, 'swollen_extremeties': 73, 'excessive_hunger': 74, 'extra_marital_contacts': 75, 'drying_and_tingling_lips': 76, 'slurred_speech': 77, 'knee_pain': 78, 'hip_joint_pain': 79, 'muscle_weakness': 80, 'stiff_neck': 81, 'swelling_joints': 82, 'movement_stiffness': 83, 'spinning_movements': 84, 'loss_of_balance': 85, 'unsteadiness': 86, 'weakness_of_one_body_side': 87, 'loss_of_smell': 88, 'bladder_discomfort': 89, 'foul_smell_ofurine': 90, 'continuous_feel_of_urine': 91, 
'passage_of_gases': 92, 'internal_itching': 93, 'toxic_look_(typhos)': 94, 'depression': 95, 'irritability': 96, 'muscle_pain': 97, 'altered_sensorium': 98, 'red_spots_over_body': 99, 'belly_pain': 100, 'abnormal_menstruation': 101, 'dischromic_patches': 102, 'watering_from_eyes': 103, 'increased_appetite': 104, 'polyuria': 105, 'family_history': 106, 'mucoid_sputum': 107, 'rusty_sputum': 108, 'lack_of_concentration': 109, 'visual_disturbances': 110, 'receiving_blood_transfusion': 111, 'receiving_unsterile_injections': 112, 'coma': 113, 'stomach_bleeding': 114, 'distention_of_abdomen': 115, 'history_of_alcohol_consumption': 116, 'blood_in_sputum': 118, 'prominent_veins_on_calf': 119, 'palpitations': 120, 'painful_walking': 121, 'pus_filled_pimples': 122, 'blackheads': 123, 'scurring': 124, 'skin_peeling': 125, 'silver_like_dusting': 126, 'small_dents_in_nails': 127, 'inflammatory_nails': 128, 'blister': 129, 'red_sore_around_nose': 130, 'yellow_crust_ooze': 131, 'prognosis': 132}
# The placeholder written for missing symptoms has no entry in the table, so a
# plain .map(encoder) turned every 'Unknown' cell into NaN and the network was
# fed NaN features. Give the placeholder an explicit code of its own;
# setdefault keeps a value the table might already define.
UNKNOWN_CODE = encoder.setdefault('Unknown', -1)
for column in symptoms_filled.columns:
    # strip() tolerates values padded with spaces (no effect on clean values).
    names = symptoms_filled[column].astype(str).str.strip()
    # Names missing from the table share the placeholder's code instead of
    # becoming NaN, which also keeps the column integer-typed.
    symptoms_filled[column] = names.map(encoder).fillna(UNKNOWN_CODE).astype(int)

In [None]:
# Encoding the target 'Disease'
# `encoder` is the plain symptom -> code dict at this point and a dict has no
# fit_transform, so the labels get a LabelEncoder of their own. Keeping it in
# its own name also preserves classes_ for turning predictions back into names.
disease_encoder = LabelEncoder()
disease_encoded = disease_encoder.fit_transform(sorted_data['Disease'])

In [None]:
# Assemble the model table: encoded symptom features on a fresh 0..n-1 index,
# with the encoded target stored under 'Disease'.
preprocessed_data = (
    symptoms_filled
    .reset_index(drop=True)
    .assign(Disease=disease_encoded)
)

In [None]:
# Display the first few rows of the preprocessed data
# (encoded symptom columns plus the encoded 'Disease' target)
preprocessed_data.head()

In [None]:
# Separate the target column from the feature columns
y = preprocessed_data['Disease']
X = preprocessed_data.drop('Disease', axis=1)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Preview the training features.
X_train.head()

In [None]:
# (rows, feature columns) of the training matrix.
X_train.shape

In [None]:
# Number of distinct encoded disease labels in the full preprocessed table.
n_classes = preprocessed_data['Disease'].nunique()
n_classes

In [None]:
import os
import matplotlib.pyplot as plt
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, BatchNormalization

# Define the model architecture
model = Sequential()
# The network consumes one value per feature column, so the input width is the
# number of columns in X_train (not the number of classes).
model.add(Input(shape=(X_train.shape[1],), name='Input_Layer'))
model.add(Dense(256, activation='relu', name='Dense_Layer_1'))
model.add(BatchNormalization(name='Batch_Normalization_1'))
model.add(Dropout(0.5, name='Dropout_1'))
model.add(Dense(128, activation='relu', name='Hidden_Layer'))
model.add(BatchNormalization(name='Batch_Normalization_2'))
model.add(Dropout(0.5, name='Dropout_2'))
# One softmax unit per disease class
model.add(Dense(n_classes, activation='softmax', name='Output_Layer'))

# Plot the model
architecture_png = 'model_architecture.png'
plot_model(model, to_file=architecture_png, show_shapes=True, show_layer_names=True)

# Display the plot. Read back the same file plot_model was asked to write;
# checking an unrelated absolute path always fell through to the
# "not found" branch.
if os.path.exists(architecture_png):
    img = plt.imread(architecture_png)
    plt.imshow(img)
    plt.axis('off')
    plt.show()
else:
    print("File not found: 'model_architecture.png'. Please check if it was created successfully.")


# Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# Number of input features = number of columns in the training matrix
n_features = X_train.shape[1]
# Width of the softmax layer and of the one-hot vectors built from it.
# Taken from the largest label id in either split (labels are assumed to be
# non-negative integer ids, as LabelEncoder produces): counting only the
# distinct labels in y_train comes up short whenever the random split leaves a
# class out of the training part, and one-hot encoding y_test then fails.
n_classes = int(max(y_train.max(), y_test.max())) + 1

# Build the model
model = Sequential()
# Declare the input with an explicit Input object rather than an input_shape
# keyword on the first Dense layer.
model.add(tf.keras.Input(shape=(n_features,)))
model.add(Dense(256, activation='relu'))  # First dense layer
model.add(BatchNormalization())  # Normalize the output of the previous layer
model.add(Dropout(0.5))  # Dropout to reduce overfitting

model.add(Dense(128, activation='relu'))  # Hidden layer
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(n_classes, activation='softmax'))  # Output layer

In [None]:
# Compile the model
# categorical_crossentropy expects one-hot targets; the labels are converted
# with to_categorical before training.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Silence all warnings from this point on (applies to the whole kernel session)
import warnings
warnings.filterwarnings('ignore')

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode both label vectors with the same number of classes
y_train_one_hot, y_test_one_hot = (
    to_categorical(labels, num_classes=n_classes)
    for labels in (y_train, y_test)
)

In [None]:
# Fit for 30 epochs in mini-batches of 32, scoring the held-out split after each epoch
history = model.fit(
    X_train,
    y_train_one_hot,
    epochs=30,
    batch_size=32,
    validation_data=(X_test, y_test_one_hot),
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation curves, accuracy on the left and loss on the right.
plt.figure(figsize=(12, 6))

# (subplot position, history key, panel title, y-axis label)
panels = (
    (1, 'accuracy', 'Model accuracy', 'Accuracy'),
    (2, 'loss', 'Model loss', 'Loss'),
)
for position, metric, title, y_label in panels:
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric])           # training curve
    plt.plot(history.history['val_' + metric])  # validation curve
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')

plt.show()

In [None]:
import joblib

# Save the label encoder for symptoms
joblib.dump(encoder, 'symptom_encoder.joblib')

# Save the label encoder for diseases (if different from symptoms)
# NOTE(review): this writes the same `encoder` object as the dump above, so both
# files end up with identical content — confirm whether a separate disease
# encoder was meant to be saved here.
joblib.dump(encoder, 'disease_encoder.joblib')

# Save the SimpleImputer used for handling missing values
joblib.dump(imputer, 'imputer.joblib')

In [None]:
# Persist the trained network in Keras' own format. This is the supported way
# to store a Keras model and does not depend on the model being picklable.
model.save('model.keras')

# Also keep the joblib pickle so anything that already loads 'model.joblib'
# continues to work.
# NOTE(review): joblib files are pickles — only load them from trusted sources.
joblib_file = 'model.joblib'
joblib.dump(model, joblib_file)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score the current X_test / y_test in consecutive slices
n_splits = 5

# Rows per slice; the last slice also takes whatever remainder is left
split_size = len(X_test) // n_splits

# Slice boundaries: 0, s, 2s, ..., (n_splits - 1) * s, len(X_test)
edges = [k * split_size for k in range(n_splits)] + [len(X_test)]

# Per-slice scores, in slice order
accuracies, precisions, recalls = [], [], []

for start_idx, end_idx in zip(edges[:-1], edges[1:]):
    X_split = X_test[start_idx:end_idx]
    y_split = y_test[start_idx:end_idx]

    # Class probabilities from the trained model -> most likely class per row
    y_pred = model.predict(X_split)
    y_pred_labels = np.argmax(y_pred, axis=1)

    accuracies.append(accuracy_score(y_split, y_pred_labels))
    precisions.append(precision_score(y_split, y_pred_labels, average='weighted'))
    recalls.append(recall_score(y_split, y_pred_labels, average='weighted'))

# Per-slice results
print("Accuracies for each split:", accuracies)
print("Precisions for each split:", precisions)
print("Recalls for each split:", recalls)

# Unweighted means over the slices
print("Average accuracy:", np.mean(accuracies))
print("Average precision:", np.mean(precisions))
print("Average recall:", np.mean(recalls))

In [None]:
# Load the separate evaluation file into `testdf`.
testdf = pd.read_csv("/kaggle/input/medical-test-data/new_unseen_data.csv")

In [None]:
import pandas as pd

# Prepare the unseen data loaded into `testdf`. This cell previously re-processed
# `dataset`, so the evaluation that follows scored the original data a second
# time and the unseen file was never used.
# NOTE(review): assumes testdf has the same layout as dataset ('Disease' first,
# then the symptom columns) — confirm against the file.
symptom_columns = testdf.columns[1:]

# Reuse sort_symptoms from the earlier cell instead of defining it a second
# time; it reads the module-level symptom_columns assigned just above.
sorted_symptoms_df = testdf.apply(sort_symptoms, axis=1)

# Combine sorted symptoms with the target variable 'Disease'
sorted_data = pd.concat([sorted_symptoms_df, testdf['Disease']], axis=1)

# Display the first few rows of the sorted unseen data
print(sorted_data.head())

In [None]:
# Filling missing symptom values with a placeholder 'Unknown'
imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
# Take exactly the feature columns the model was trained on (the columns of the
# training feature table X), by name. The positional slice iloc[:, 1:] skipped
# the first symptom column and pulled the 'Disease' label into the features,
# because 'Disease' is the last column of sorted_data.
symptoms = sorted_data[list(X.columns)]
symptoms_filled = pd.DataFrame(imputer.fit_transform(symptoms), columns=symptoms.columns)

In [None]:
# Integer-encode every symptom column with a LabelEncoder refit per column.
# NOTE(review): the codes are learned from the data in this cell, column by
# column, so they are not tied to the fixed symptom -> code table used when the
# training features were encoded — confirm the model sees the same encoding.
encoder = LabelEncoder()
for name, values in list(symptoms_filled.items()):
    symptoms_filled[name] = encoder.fit_transform(values)

In [None]:
# Encoding the target 'Disease'
# NOTE(review): the encoder is refit here, so label ids follow the sorted set of
# disease names present in this data; they only line up with the ids the model
# was trained on if that set is identical — confirm.
disease_encoded = encoder.fit_transform(sorted_data['Disease'])

In [None]:
# Build the evaluation table: encoded features plus the encoded 'Disease' target
preprocessed_data = symptoms_filled.assign(Disease=disease_encoded)

In [None]:
# Replace X_test / y_test with the features and target of the table built above
y_test = preprocessed_data['Disease']
X_test = preprocessed_data.drop('Disease', axis=1)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Evaluate the trained model on X_test / y_test, one contiguous chunk at a time
n_splits = 5

# Chunk length; the final chunk is extended to the end of the data
split_size = len(X_test) // n_splits

# Scores collected chunk by chunk
accuracies = []
precisions = []
recalls = []

start_idx = 0
for chunk_number in range(1, n_splits + 1):
    # The last chunk runs to the end so the remainder rows are included
    end_idx = len(X_test) if chunk_number == n_splits else chunk_number * split_size

    X_split, y_split = X_test[start_idx:end_idx], y_test[start_idx:end_idx]

    # Predicted class = index of the largest softmax output in each row
    y_pred = model.predict(X_split)
    y_pred_labels = np.argmax(y_pred, axis=1)

    accuracies.append(accuracy_score(y_split, y_pred_labels))
    precisions.append(precision_score(y_split, y_pred_labels, average='weighted'))
    recalls.append(recall_score(y_split, y_pred_labels, average='weighted'))

    # The next chunk starts where this one ended
    start_idx = end_idx

# Scores per chunk
print("Accuracies for each split:", accuracies)
print("Precisions for each split:", precisions)
print("Recalls for each split:", recalls)

# Simple averages across the chunks
print("Average accuracy:", np.mean(accuracies))
print("Average precision:", np.mean(precisions))
print("Average recall:", np.mean(recalls))