# Importing Necessary Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
import joblib
import os
import time
import warnings
warnings.filterwarnings('ignore')

# Data Preprocessing


In [None]:
# Load the raw dataset; the path is resolved relative to the current working directory.
df = pd.read_csv('Dataset - Diseases and Symptoms.csv')

### Working with Null Values


In [None]:
# Data-quality summary of the raw frame: number of rows holding at least one
# missing value, that number as a fraction of all rows, and the number of rows
# flagged as duplicates by DataFrame.duplicated().
null_rows_count = df.isna().any(axis='columns').sum()
null_ratio = null_rows_count / df.shape[0]

print("Total Rows with Null Values:", null_rows_count)
print("Null Values Ratio:", null_ratio)
print("Duplicated Rows: ",df.duplicated().sum())

In [None]:
# Remove duplicated rows; `df` itself is left unchanged (drop_duplicates returns a new frame).
df_without_duplicate = df.drop_duplicates()

In [None]:
# Same data-quality summary as for the raw frame, recomputed on the
# de-duplicated frame (the duplicate count printed last should now be 0).
null_rows_count = df_without_duplicate.isna().any(axis='columns').sum()
null_ratio = null_rows_count / df_without_duplicate.shape[0]

print("Total Rows with Null Values:", null_rows_count)
print("Null Values Ratio:", null_ratio)
print("Duplicated Rows: ",df_without_duplicate.duplicated().sum())

In [None]:
# Split the de-duplicated frame into the symptom columns and the per-disease
# row counts. These were previously derived from the raw `df`, so duplicated
# rows inflated both the symptom totals and the disease counts, and the
# rarity thresholds computed from them did not describe the data that is
# actually kept.
symptoms_without_duplicate = df_without_duplicate.drop(columns=['diseases'])
diseases_without_duplicate = df_without_duplicate['diseases'].value_counts()

In [None]:
# Total each symptom column and list the totals from most to least frequent.
most_common_symptoms_without_duplicate = (
    symptoms_without_duplicate
    .sum()
    .sort_values(ascending=False)
)
print("Symtoms Count: ", most_common_symptoms_without_duplicate.count())
print(most_common_symptoms_without_duplicate.to_string())

In [None]:
# Report how many distinct diseases there are, followed by the full
# per-disease row counts (to_string avoids pandas' row truncation).
print("Disease Count: ", diseases_without_duplicate.count())
print(diseases_without_duplicate.to_string())

### Feature Engineering


In [None]:
# Keep only the symptom columns whose column total is below 10; these are
# treated as too rare to be useful and are dropped from the data later on.
redundant_symptoms = symptoms_without_duplicate.loc[:, symptoms_without_duplicate.sum() < 10]

In [None]:
# Bare expression: shown as the notebook cell's output for inspection.
redundant_symptoms

In [None]:
# Drop the rare symptom columns from the de-duplicated frame.
cleaned_data = df_without_duplicate.drop(columns=list(redundant_symptoms.columns))

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the output.
cleaned_data.info()
print('Shape:',cleaned_data.shape)

In [None]:
# Count rows per disease and collect the labels of diseases that occur in
# fewer than 10 rows, then print those diseases with their counts.
disease_counts_cleaned = cleaned_data['diseases'].value_counts()
diseases_to_drop = disease_counts_cleaned.loc[disease_counts_cleaned.lt(10)].index

print("Diseases with count less than 10:")
print(disease_counts_cleaned.loc[diseases_to_drop])

In [None]:
# Remove every row whose disease is in the rare list and renumber the index.
cleaned_data = cleaned_data[~cleaned_data['diseases'].isin(diseases_to_drop)].reset_index(drop=True)

print("\nShape after dropping diseases with count less than 10:", cleaned_data.shape)
print("\nInfo after dropping diseases with count less than 10:")
# info() prints its own report and returns None; calling it directly avoids
# the extra "None" line that print(cleaned_data.info()) produced.
cleaned_data.info()

In [None]:
# Print column names, dtypes and non-null counts of the cleaned frame.
cleaned_data.info()

In [None]:
# Bare expression: (rows, columns) of the cleaned frame, shown as the cell output.
cleaned_data.shape

In [None]:
# Persist the cleaned frame without the index column; the model-development
# section reads this file back in.
cleaned_data.to_csv('cleaned_data.csv', index=False)

# Data Visualization


In [None]:
# Horizontal bar chart of the ten most frequent symptoms in the cleaned data.
plt.figure(figsize=(12, 6))
symptom_sums = cleaned_data.drop(columns=['diseases']).sum().sort_values(ascending=False)
# The variable name, the plot title and the output file name all promise ten
# bars; head(20) used to draw twenty, contradicting all three.
top_10_symptoms = symptom_sums.head(10)
sns.barplot(x=top_10_symptoms.values, y=top_10_symptoms.index, palette='magma')
plt.title('Top 10 Most Common Symptoms')
plt.xlabel('Frequency')
plt.ylabel('Symptom')
plt.tight_layout()
plt.savefig('top_10_symptoms.png')
plt.show()

In [None]:
# Bar chart of the five most and five least frequent diseases, with the row
# count written above each bar.
disease_counts = cleaned_data['diseases'].value_counts()

# Five from each end, matching the variable names, the plot title and the
# ten-colour palette below; head(10)/tail(10) used to draw twenty bars.
top_5 = disease_counts.head(5)
bottom_5 = disease_counts.tail(5)

combined_counts = pd.concat([top_5, bottom_5])

colors = sns.color_palette('viridis', n_colors=10)

plt.figure(figsize=(12, 6))
sns.barplot(x=combined_counts.index, y=combined_counts.values, palette=colors)
plt.title('Top 5 and Lowest 5 Diseases by Frequency')
plt.xlabel('Disease')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')

plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()

# Annotate every bar with its count, placed slightly above the bar top.
for i, count in enumerate(combined_counts.values):
    plt.text(i, count + 0.5, str(count), ha='center', va='bottom', fontsize=10)

plt.savefig('top_bottom_diseases.png')
plt.show()

In [None]:
# Box plot of the per-row symptom total (sum of all symptom columns in a row).
plt.figure(figsize=(10, 8))
symptom_counts = cleaned_data.drop(columns='diseases').sum(axis='columns')
sns.boxplot(y=symptom_counts, color='#66c2a5')

plt.title('Distribution of Symptom Counts per Disease Instance')
plt.xlabel('All Diseases')
plt.ylabel('Number of Symptoms')
plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig('symptom_counts_boxplot.png')
plt.show()

In [None]:
# Bubble chart: one point per disease, x = number of rows for the disease,
# y = average number of symptoms per row of that disease, bubble size/colour =
# number of distinct symptoms ever recorded for the disease.
plt.figure(figsize=(12, 8))
disease_freq = cleaned_data['diseases'].value_counts()
symptom_sums = cleaned_data.groupby('diseases').sum().drop(columns=['diseases'], errors='ignore')
# Average symptoms per instance = all symptom occurrences recorded for the
# disease divided by its number of rows (the division aligns on the disease
# label). The previous symptom_sums.mean(axis=1) divided by the number of
# symptom COLUMNS instead, which scales with disease frequency and is not the
# quantity named on the y-axis.
avg_symptoms = symptom_sums.sum(axis=1).div(disease_freq)
unique_symptoms = symptom_sums.gt(0).sum(axis=1)
bubble_data = pd.DataFrame({
    'Frequency': disease_freq,
    'Avg_Symptoms': avg_symptoms,
    'Unique_Symptoms': unique_symptoms
}).reset_index()

sns.scatterplot(data=bubble_data, x='Frequency', y='Avg_Symptoms', size='Unique_Symptoms', sizes=(50, 500), alpha=0.6, hue='Unique_Symptoms', palette='viridis')
plt.title('Disease Frequency vs. Average Symptom Count')
plt.xlabel('Disease Frequency')
plt.ylabel('Average Number of Symptoms')
plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig('disease_freq_symptom_bubble.png')
plt.show()

# Model Development


In [None]:
# Reload the cleaned dataset from disk, replacing the in-memory frame, so the
# modelling section can be run without re-running the preprocessing cells.
cleaned_data = pd.read_csv('cleaned_data.csv')

In [None]:
# Target is the disease label; features are every remaining (symptom) column.
y = cleaned_data['diseases']
X = cleaned_data.drop(columns='diseases')

In [None]:
# Write the feature (symptom) names, one per line, with no header or index.
pd.Series(X.columns).to_csv('symptoms.csv', index=False, header=False)
print("CSV file 'symptoms.csv' created successfully.")

In [None]:
# Learn the disease -> integer mapping, then encode the target with it.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
print(f"Target encoded. Number of classes: {len(label_encoder.classes_)}")

In [None]:
# Build the integer-code -> disease-name lookup from the fitted encoder and
# save it as a two-column CSV so predictions can be decoded later.
disease_mapping = dict(
    zip(
        label_encoder.transform(label_encoder.classes_),
        label_encoder.classes_,
    )
)
mapping_df = pd.DataFrame(
    list(disease_mapping.items()),
    columns=['Encoded', 'Disease'],
)
mapping_df.to_csv('disease_mapping.csv', index=False)

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the encoded
# label and uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [None]:
# Report how many rows ended up in each part of the split.
print(f"Training set: {X_train.shape[0]} samples, Test set: {X_test.shape[0]} samples")

### Logistic Regression


In [None]:
# print("\nTraining Logistic Regression...")
# start_time = time.time()
# lr_model = LogisticRegression(max_iter=500, solver='saga',
#                               multi_class='multinomial', n_jobs=-1, random_state=42)
# lr_model.fit(X_train, y_train)
# lr_time = time.time() - start_time
# print(f"Training time: {lr_time:.2f} seconds")

In [None]:
# joblib.dump(lr_model, f"/content/drive/Othercomputers/My Laptop/Projects/intelligent-paitent-diagnostic-system/logistic_regression_model.joblib")

### Decision Tree


In [None]:
# print("\nTraining Decision Tree Classifier...")
# start_time = time.time()
# dt_model = DecisionTreeClassifier(random_state=42)
# dt_model.fit(X_train, y_train)
# dt_time = time.time() - start_time
# print(f"Training time: {dt_time:.2f} seconds")

In [None]:
# joblib.dump(dt_model, f"/content/drive/Othercomputers/My Laptop/Projects/intelligent-paitent-diagnostic-system/decision_tree_model.joblib")

### K-Nearest Neighbors


In [None]:
# print("\nTraining K-Nearest Neighbors Classifier...")
# start_time = time.time()
# knn_model = KNeighborsClassifier(n_neighbors=7)
# knn_model.fit(X_train, y_train)
# knn_time = time.time() - start_time
# print(f"Training time: {knn_time:.2f} seconds")

In [None]:
# joblib.dump(knn_model, f"/content/drive/Othercomputers/My Laptop/Projects/intelligent-paitent-diagnostic-system/knn_model.joblib")

### 1D-CNN


In [None]:
# Conv1D consumes (samples, steps, channels): append a trailing channel axis
# of size 1 to the 2-D feature matrices.
X_train_dl = np.expand_dims(X_train.values, axis=-1)
X_test_dl = np.expand_dims(X_test.values, axis=-1)

In [None]:
# 1D-CNN classifier: two convolution/pooling stages over the symptom vector,
# followed by a dense head with one softmax output per disease class.
model = Sequential()
model.add(Conv1D(32, 3, activation='relu', input_shape=(X.shape[1], 1)))
model.add(MaxPooling1D(2))
model.add(Dropout(0.2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Integer-encoded targets, hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the reshaped (samples, n_features, 1) array: the first Conv1D layer
# is declared with input_shape=(n_features, 1), so the 2-D X_train frame that
# was passed before does not match the model's expected input rank.
# The last 20% of the training rows are held out for per-epoch validation.
history = model.fit(X_train_dl, y_train, epochs=200, batch_size=64, validation_split=0.2)

In [None]:
# Persist the trained network to cnn_model.h5 in the working directory; the
# testing section loads it back from that file.
model.save('cnn_model.h5')
print("Model saved successfully!")

### Confusion Matrix


In [None]:
def plot_simplified_cm(y_true, y_pred, model_name):
    """Plot and save a confusion matrix limited to the most frequent true classes.

    At most 40 classes are drawn, chosen as the classes that occur most often
    in ``y_true``. Tick labels are decoded through the module-level
    ``label_encoder``. The figure is saved as
    ``<model_name lower-cased, spaces -> underscores>_confusion_matrix.png``
    and then shown.

    Args:
        y_true: 1-D array-like of integer-encoded true labels.
        y_pred: 1-D array-like of integer-encoded predicted labels.
        model_name: Human-readable model name used in the title and file name.
    """
    unique_diseases = np.unique(np.concatenate([y_true, y_pred]))
    max_diseases = min(40, len(unique_diseases))

    disease_counts = pd.Series(y_true).value_counts()
    diseases_to_show = disease_counts.head(max_diseases).index
    # y_pred may contain classes that never occur in y_true, so fewer classes
    # than max_diseases can actually be available; report the real number.
    num_diseases_to_show = len(diseases_to_show)

    cm = confusion_matrix(y_true, y_pred, labels=diseases_to_show)
    tick_labels = [label_encoder.classes_[i] for i in diseases_to_show]

    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='RdBu', center=0,
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.title(f'{model_name} - Confusion Matrix (Top {num_diseases_to_show} Diseases)')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    # Build the file stem outside the f-string: reusing the enclosing quote
    # character inside an f-string expression is a SyntaxError before Python 3.12.
    file_stem = model_name.replace(' ', '_').lower()
    plt.savefig(f'{file_stem}_confusion_matrix.png')
    plt.show()

### Training and Validation Accuracy Comparison (1D-CNN)


In [None]:
# Animated line chart of training (and, when recorded, validation) accuracy
# per epoch, saved as a GIF and embedded in the notebook as JS/HTML.
import numpy as np
import matplotlib.pyplot as plt

train_acc = history.history['accuracy']
# .get() returns None when no validation data was used during fit; every
# `if val_acc` below switches the validation artists off in that case.
val_acc = history.history.get('val_accuracy')

epochs = np.arange(1, len(train_acc) + 1)
mean_train = np.mean(train_acc)
mean_val = np.mean(val_acc) if val_acc else None

fig, ax = plt.subplots(figsize=(10, 6))
# Fixed axis limits so the view does not rescale while the lines grow.
ax.set_xlim(0.5, len(epochs) + 0.5)
ax.set_ylim(0, 1.05)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
# NOTE(review): the title says "Human Vitals" although this notebook works on a
# disease/symptom dataset -- possibly carried over from another project; confirm.
ax.set_title('Human Vitals 1D-CNN Accuracy')
ax.grid(True)

# Empty line artists that the animation fills frame by frame, plus static
# dashed horizontal lines at the mean accuracy over all epochs.
line_train, = ax.plot([], [], label='Train Accuracy', color='blue')
line_val, = ax.plot([], [], label='Validation Accuracy', color='green') if val_acc else (None,)
mean_line_train = ax.axhline(mean_train, color='blue', linestyle='--', alpha=0.5, label='Train Mean')
mean_line_val = ax.axhline(mean_val, color='green', linestyle='--', alpha=0.5, label='Val Mean') if val_acc else None

ax.legend()

def init():
    """Clear the animated lines and return the artists to be redrawn."""
    line_train.set_data([], [])
    if val_acc:
        line_val.set_data([], [])
    return (line_train, line_val) if val_acc else (line_train,)

def update(frame):
    """Extend the animated lines up to and including epoch index ``frame``."""
    line_train.set_data(epochs[:frame + 1], train_acc[:frame + 1])
    if val_acc:
        line_val.set_data(epochs[:frame + 1], val_acc[:frame + 1])
        return line_train, line_val
    return line_train,

# One frame per epoch, 100 ms apart; blit=True redraws only the returned artists.
ani = FuncAnimation(fig, update, frames=len(epochs), init_func=init, blit=True, interval=100)

ani.save('cnn_accuracy_animation.gif', writer='pillow')

# Close the static figure so only the HTML animation appears as the cell output.
plt.close()
HTML(ani.to_jshtml())

# Testing ML Models


In [None]:
# Draw 15 random rows from the cleaned data and record, per sampled disease,
# the names of the symptom columns whose value is 1 in that row. When the same
# disease is sampled more than once, the later row overwrites the earlier one.
testing_data = cleaned_data.sample(15)
test_sym_map = {}

symptom_columns = [name for name in testing_data.columns if name != 'diseases']
for _, sample_row in testing_data.iterrows():
    test_sym_map[sample_row['diseases']] = [
        name for name in symptom_columns if sample_row[name] == 1
    ]

In [None]:
# Load the four persisted models from the working directory.
# `tf` was never imported in this notebook, so the original
# tf.keras.models.load_model(...) call raised NameError; import the loader
# from the same package the notebook already uses for Sequential.
from tensorflow.keras.models import load_model

lr_testing = joblib.load('logistic_regression_model.joblib')
knn_testing = joblib.load('knn_model.joblib')
dt_testing = joblib.load('decision_tree_model.joblib')
# os.path.join with a single argument returned the name unchanged, so the
# file name is passed directly.
cnn_testing = load_model('cnn_model.h5')

In [None]:
# Lookup table with columns 'Encoded' (integer class id) and 'Disease' (name),
# as written by the label-encoding step; used to decode model predictions.
encoded_disease = pd.read_csv('disease_mapping.csv')

### Testing Logistic Regression


In [None]:
# Spot-check the logistic-regression model on every sampled row: predict the
# class, decode it to a disease name and print it next to the true disease
# together with the model's probability for the predicted class.
actual_labels = []
predicted_labels = []

print('Logistics Regression Testing: ')

for _, sample_row in testing_data.iterrows():
    disease_name = sample_row['diseases']

    # Single-row frame so the model receives the symptom column names.
    pred_df = pd.DataFrame([sample_row.drop('diseases')])

    predicted_class_index = lr_testing.predict(pred_df)[0]
    is_predicted = encoded_disease['Encoded'] == predicted_class_index
    predicted_disease_name = encoded_disease.loc[is_predicted, 'Disease'].iloc[0]

    actual_labels.append(disease_name)
    predicted_labels.append(predicted_disease_name)

    confidence_of_predicted_class = lr_testing.predict_proba(pred_df)[0][predicted_class_index]
    print(f'Actual Disease: {disease_name}, Predicted Disease: {predicted_disease_name}, Confidence: {confidence_of_predicted_class:.4f}')
    print('=============================================================================================================================')

In [None]:
# Held-out evaluation of logistic regression: accuracy and support-weighted F1.
lr_pred = lr_testing.predict(X_test)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_pred)
lr_f1_score = f1_score(y_true=y_test, y_pred=lr_pred, average='weighted')
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f} | F1 Score: {lr_f1_score:.4f}')

In [None]:
# Draw and save the confusion matrix for the logistic-regression test predictions.
plot_simplified_cm(y_test, lr_pred, "Logistic Regression")

### Testing K-Nearest Neighbors


In [None]:
# Spot-check the K-Nearest Neighbors model on every sampled row: predict the
# class, decode it to a disease name and print it next to the true disease
# together with the model's probability for the predicted class.
actual_labels = []
predicted_labels = []

print('K-Nearest Neighbors Testing: ')

for i in range(len(testing_data)):
  disease_name = testing_data.iloc[i]['diseases']

  # Single-row frame so the model receives the symptom column names.
  pred_series = testing_data.iloc[i].drop('diseases')
  pred_df = pd.DataFrame([pred_series])

  predictions = knn_testing.predict(pred_df)

  predicted_class_index = predictions[0]
  predicted_disease_name = encoded_disease.loc[encoded_disease['Encoded'] == predicted_class_index]['Disease'].iloc[0]

  actual_labels.append(disease_name)
  predicted_labels.append(predicted_disease_name)

  # Confidence must come from the model that made the prediction; this used to
  # query lr_testing, i.e. it reported logistic regression's probability for
  # the class KNN had chosen.
  confidence_scores = knn_testing.predict_proba(pred_df)
  confidence_of_predicted_class = confidence_scores[0][predicted_class_index]
  print(f'Actual Disease: {disease_name}, Predicted Disease: {predicted_disease_name}, Confidence: {confidence_of_predicted_class:.4f}')
  print('=============================================================================================================================')

In [None]:
# Held-out evaluation of K-Nearest Neighbors: accuracy and support-weighted F1.
knn_pred = knn_testing.predict(X_test)
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_pred)
knn_f1_score = f1_score(y_true=y_test, y_pred=knn_pred, average='weighted')
print(f'K-Nearest Neighbors Classifier Accuracy: {knn_accuracy:.4f} | F1 Score: {knn_f1_score:.4f}')

In [None]:
# Draw and save the confusion matrix for the K-Nearest Neighbors test predictions.
plot_simplified_cm(y_test, knn_pred, "K Nearest Neighbor")

### Testing Decision Tree


In [None]:
# Spot-check the Decision Tree model on every sampled row: predict the class,
# decode it to a disease name and print it next to the true disease together
# with the model's probability for the predicted class.
actual_labels = []
predicted_labels = []

# Header used to read "K Nearest Neighbors Testing", mislabelling this section's output.
print('Decision Tree Testing: ')

for i in range(len(testing_data)):
  disease_name = testing_data.iloc[i]['diseases']

  # Single-row frame so the model receives the symptom column names.
  pred_series = testing_data.iloc[i].drop('diseases')
  pred_df = pd.DataFrame([pred_series])

  predictions = dt_testing.predict(pred_df)

  predicted_class_index = predictions[0]
  predicted_disease_name = encoded_disease.loc[encoded_disease['Encoded'] == predicted_class_index]['Disease'].iloc[0]

  actual_labels.append(disease_name)
  predicted_labels.append(predicted_disease_name)

  # Confidence must come from the model that made the prediction; this used to
  # query lr_testing, i.e. it reported logistic regression's probability for
  # the class the decision tree had chosen.
  confidence_scores = dt_testing.predict_proba(pred_df)
  confidence_of_predicted_class = confidence_scores[0][predicted_class_index]
  print(f'Actual Disease: {disease_name}, Predicted Disease: {predicted_disease_name}, Confidence: {confidence_of_predicted_class:.4f}')
  print('=============================================================================================================================')

In [None]:
# Held-out evaluation of the decision tree: accuracy and support-weighted F1.
dt_pred = dt_testing.predict(X_test)
dt_accuracy = accuracy_score(y_true=y_test, y_pred=dt_pred)
dt_f1_score = f1_score(y_true=y_test, y_pred=dt_pred, average='weighted')
print(f'Decision Tree Classifier Accuracy: {dt_accuracy:.4f} | F1 Score: {dt_f1_score:.4f}')

In [None]:
# Draw and save the confusion matrix for the decision-tree test predictions.
plot_simplified_cm(y_test, dt_pred, "Decision Tree")

### Testing 1D-CNN Model


In [None]:
# Spot-check the 1D-CNN on every sampled row: predict the class probabilities,
# take the most probable class, decode it to a disease name and print it next
# to the true disease together with that class's softmax probability.
actual_labels = []
predicted_labels = []

print('1D CNN Testing: ')

for i in range(len(testing_data)):
    disease_name = testing_data.iloc[i]['diseases']

    # The row held the disease string, so its values are object dtype; cast to
    # a numeric array before handing it to the network.
    pred_series = testing_data.iloc[i].drop('diseases').values.astype('float32')
    # Conv1D input is (samples, n_features, 1): one sample, one channel.
    pred_array = pred_series.reshape(1, -1, 1)

    # The loaded network is bound to `cnn_testing`; `cnn_model` was never
    # defined and raised NameError.
    predictions = cnn_testing.predict(pred_array)

    predicted_class_index = np.argmax(predictions[0])
    predicted_disease_name = encoded_disease.loc[encoded_disease['Encoded'] == predicted_class_index]['Disease'].iloc[0]

    actual_labels.append(disease_name)
    predicted_labels.append(predicted_disease_name)

    confidence_of_predicted_class = predictions[0][predicted_class_index]
    print(f'Actual Disease: {disease_name}, Predicted Disease: {predicted_disease_name}, Confidence: {confidence_of_predicted_class:.4f}')
    print('=============================================================================================================================')

In [None]:
# Held-out evaluation of the 1D-CNN. The predictions used to come from
# dt_testing, so the "CNN" scores were a copy of the decision tree's. The
# network needs the (samples, n_features, 1) test array and returns one
# probability per class, so the predicted label is the argmax of each row.
cnn_pred = np.argmax(cnn_testing.predict(X_test_dl), axis=1)
cnn_accuracy = accuracy_score(y_test, cnn_pred)
cnn_f1_score = f1_score(y_test, cnn_pred, average='weighted')
print(f'1D-CNN Accuracy: {cnn_accuracy:.4f} | F1 Score: {cnn_f1_score:.4f}')

In [None]:
# Plot the predictions stored for the CNN; this used to pass dt_pred, which
# drew the decision tree's confusion matrix under the "1D-CNN" title.
plot_simplified_cm(y_test, cnn_pred, "1D-CNN")

# Evaluating Best Model


In [None]:
# Horizontal bar chart comparing the test accuracy of the four models, best
# model first, with each bar annotated by its accuracy as a percentage.
accuracies = {
    'Logistic Regression': lr_accuracy,
    'Decision Tree': dt_accuracy,
    'K-Nearest Neighbors': knn_accuracy,
    '1D-CNN': cnn_accuracy
}

accuracy_df = pd.DataFrame({
    'Model': list(accuracies.keys()),
    'Accuracy': list(accuracies.values()),
}).sort_values(by='Accuracy', ascending=False)

plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Accuracy', y='Model', data=accuracy_df, palette='viridis')

# Write the percentage just to the right of each bar, vertically centred on it.
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_centre_y = bar.get_y() + bar.get_height()/2.
    plt.text(bar_length + 0.02, bar_centre_y, '{:1.2f}%'.format(bar_length * 100), ha="left", va="center")

plt.title('Model Accuracy Comparison', fontsize=16)
plt.xlabel('Accuracy (%)', fontsize=12)
plt.ylabel('Model', fontsize=12)
# Extra room beyond 1.0 so the text labels are not clipped.
plt.xlim(0, 1.1)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('model_accuracy_comparison.png')
plt.show()