In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the dataset from the CSV file in the working directory.
df = pd.read_csv('output.csv')

# 'Disease' is the label column; every remaining column is used as a feature.
y = df['Disease']
X = df.drop(columns='Disease')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a seeded 100-tree random forest and fit it in one step
# (fit() returns the fitted estimator itself).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = clf.predict(X_test)

# Report accuracy as a percentage.
# NOTE(review): every model in this notebook reports the same accuracy on this
# split; worth checking output.csv for rows duplicated across train and test —
# not verified here.
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Check which diseases are predicted incorrectly
# for actual, predicted in zip(y_test, y_pred):
#     if actual != predicted:
#         print(f"Actual: {actual}, Predicted: {predicted}")

Accuracy: 96.83


In [2]:
# Same configuration as the forest in the first cell (100 trees, seed 42),
# built and fitted in a single expression.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = clf.predict(X_test)

# Report accuracy as a percentage.
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 96.83


In [3]:
import pickle

# Serialise the fitted `clf` estimator to model.pkl in the working directory.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Map each disease name in `y` to an integer code.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Show the distinct labels next to the distinct integer codes they were mapped to.
for heading, values in (
    ("Unique values in y:", y),
    ("Unique values in y_encoded:", y_encoded),
):
    print(heading, set(values))

Unique values in y: {'Obesity Morbid', "Alzheimer'S Disease", 'Hiv Infections', 'Thrombus', 'Hypertensive Disease', 'Edema Pulmonary', 'Psoriasis', 'Sepsis (Invertebrate)', 'AIDS', 'Benign Prostatic Hypertrophy', 'Stenosis Aortic Valve', 'Alcoholic hepatitis', 'Hyperbilirubinemia', 'Gastroenteritis', 'Infection Urinary Tract', 'Carcinoma Breast', 'Ulcer Peptic', 'Pericardial Effusion Body Substance', 'Cellulitis', 'Ketoacidosis Diabetic', 'Spasm Bronchial', 'Dependence', 'Schizophrenia', 'Jaundice', 'Cardiomyopathy', 'Transient Ischemic Attack', 'Tricuspid Valve Insufficiency', 'Impetigo', 'Peripheral Vascular Disease', 'Paralysis (brain hemorrhage)', 'Hepatitis E', 'Dehydration', 'Kidney Failure Acute', 'Tuberculosis', 'Arthritis', 'Cirrhosis', 'Osteoporosis', 'Suicide Attempt', 'Delusion', 'Fungal infection', 'Decubitus Ulcer', 'Dimorphic hemmorhoids(piles)', 'Hypercholesterolemia', 'Bronchitis', 'Malignantneoplasms', 'Hepatitis', 'Obesity', 'Heart attack', 'Psychotic Disorder', 'Kid

In [5]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Pipeline: standardise every feature, then fit a default-parameter SVC.
# fit() returns the fitted pipeline, so construction and training are chained.
model = make_pipeline(StandardScaler(), SVC()).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Report accuracy as a percentage.
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f"SVM Accuracy: {accuracy:.2f}")

SVM Accuracy: 96.83


In [6]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with library defaults, fitted on the training split.
model = KNeighborsClassifier().fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Report accuracy as a percentage.
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f"KNN Accuracy: {accuracy:.2f}")


KNN Accuracy: 96.83


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [7]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with library defaults, fitted on the training split.
model = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Report accuracy as a percentage.
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f"Naive Bayes Accuracy: {accuracy:.2f}")


Naive Bayes Accuracy: 96.83


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Compare AdaBoost, a linear SVM and KNN individually and as a soft-voting ensemble.

# AdaBoost over depth-1 decision trees (stumps); seeded for repeatability.
base_classifier = DecisionTreeClassifier(max_depth=1)  # You can use any classifier as the base learner
adaboost_classifier = AdaBoostClassifier(base_classifier, n_estimators=50, learning_rate=1.0, random_state=42)

# Support Vector Machine (SVM) Classifier.
# probability=True is needed so the model can contribute class probabilities to
# soft voting. Those probability estimates are produced with an internal,
# randomised shuffle of the data, so the estimator is seeded to keep the
# ensemble's output repeatable from run to run.
svm_classifier = SVC(probability=True, kernel='linear', C=1.0, random_state=42)

# K-Nearest Neighbors (KNN) Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Voting Classifier
voting_classifier = VotingClassifier(
    estimators=[
        ('adaboost', adaboost_classifier),
        ('svm', svm_classifier),
        ('knn', knn_classifier)
    ],
    voting='soft'  # 'hard' for majority voting, 'soft' for weighted voting based on probabilities
)

# Train the classifiers.
# The ensemble fits its own clones of the listed estimators, so the three
# standalone fits are what make the per-model predictions below possible.
adaboost_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)
knn_classifier.fit(X_train, y_train)
voting_classifier.fit(X_train, y_train)

# Make predictions on the held-out rows
adaboost_pred = adaboost_classifier.predict(X_test)
svm_pred = svm_classifier.predict(X_test)
knn_pred = knn_classifier.predict(X_test)
voting_pred = voting_classifier.predict(X_test)

# Evaluate the performance (accuracy as a fraction in [0, 1])
print(f"AdaBoost Accuracy: {accuracy_score(y_test, adaboost_pred)}")
print(f"SVM Accuracy: {accuracy_score(y_test, svm_pred)}")
print(f"KNN Accuracy: {accuracy_score(y_test, knn_pred)}")
print(f"Voting Classifier Accuracy: {accuracy_score(y_test, voting_pred)}")


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


AdaBoost Accuracy: 0.1671612265084075
SVM Accuracy: 0.9683481701285855
KNN Accuracy: 0.9683481701285855
Voting Classifier Accuracy: 0.9683481701285855


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Compare a shallow decision tree, a linear SVM, KNN and a random forest
# individually and as a soft-voting ensemble.

# Decision Tree Classifier.
# Seeded so that the choice between equally good splits is repeatable.
base_classifier_dt = DecisionTreeClassifier(max_depth=5, random_state=42)

# Support Vector Machine (SVM) Classifier.
# probability=True is needed for soft voting; the probability estimates rely on
# an internal randomised shuffle, so the estimator is seeded for repeatability.
base_classifier_svm = SVC(probability=True, kernel='linear', C=1.0, random_state=42)

# K-Nearest Neighbors (KNN) Classifier
base_classifier_knn = KNeighborsClassifier(n_neighbors=5)

# Random Forest Classifier
base_classifier_rf = RandomForestClassifier(n_estimators=150, random_state=42)

# Voting Classifier
voting_classifier = VotingClassifier(
    estimators=[
        ('decision_tree', base_classifier_dt),
        ('svm', base_classifier_svm),
        ('knn', base_classifier_knn),
        ('random_forest', base_classifier_rf)
    ],
    voting='soft'  # 'hard' for majority voting, 'soft' for weighted voting based on probabilities
)

# Train the classifiers.
# The ensemble fits its own clones of the listed estimators, so the standalone
# fits are what make the per-model predictions below possible.
base_classifier_dt.fit(X_train, y_train)
base_classifier_svm.fit(X_train, y_train)
base_classifier_knn.fit(X_train, y_train)
base_classifier_rf.fit(X_train, y_train)
voting_classifier.fit(X_train, y_train)

# Make predictions on the held-out rows
dt_pred = base_classifier_dt.predict(X_test)
svm_pred = base_classifier_svm.predict(X_test)
knn_pred = base_classifier_knn.predict(X_test)
rf_pred = base_classifier_rf.predict(X_test)
voting_pred = voting_classifier.predict(X_test)

# Evaluate the performance (accuracy as a fraction in [0, 1])
print(f"Decision Tree Accuracy: {accuracy_score(y_test, dt_pred)}")
print(f"SVM Accuracy: {accuracy_score(y_test, svm_pred)}")
print(f"KNN Accuracy: {accuracy_score(y_test, knn_pred)}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_pred)}")
print(f"Voting Classifier Accuracy: {accuracy_score(y_test, voting_pred)}")


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Decision Tree Accuracy: 0.09792284866468842
SVM Accuracy: 0.9683481701285855
KNN Accuracy: 0.9683481701285855
Random Forest Accuracy: 0.9683481701285855
Voting Classifier Accuracy: 0.9683481701285855


In [10]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def _fit_and_score(estimator):
    """Fit `estimator` on the training split and evaluate it on the test split.

    Args:
        estimator: An unfitted classifier exposing `fit(X, y)` and `predict(X)`.

    Returns:
        A `(predictions, accuracy)` tuple: the labels predicted for `X_test`
        and the fraction of them that match `y_test`.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return predictions, accuracy_score(y_test, predictions)


# Decision Tree Classifier (seeded so the choice between equally good splits is repeatable)
base_classifier_dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_pred, dt_accuracy = _fit_and_score(base_classifier_dt)

# Support Vector Machine (SVM) Classifier.
# probability=True is needed for soft voting; the probability estimates rely on
# an internal randomised shuffle, so the estimator is seeded for repeatability.
# The model is trained once; `svm_classifier` is kept as a second name for the
# same fitted estimator instead of training an identical model again.
base_classifier_svm = SVC(probability=True, kernel='linear', C=1.0, random_state=42)
svm_pred, svm_accuracy = _fit_and_score(base_classifier_svm)
svm_classifier = base_classifier_svm

# Random Forest Classifier.
# Trained once; `rf_classifier` is kept as a second name for the same fitted
# estimator instead of training an identical forest again.
base_classifier_rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf_pred, rf_accuracy = _fit_and_score(base_classifier_rf)
rf_classifier = base_classifier_rf
print(f"Random Forest Accuracy: {rf_accuracy}")

# GradientBoosting Classifier (not part of the voting ensemble below)
gb_classifier = GradientBoostingClassifier(n_estimators=20, learning_rate=1.0, max_depth=1, random_state=42)
gb_pred, gb_accuracy = _fit_and_score(gb_classifier)
print(f"Gradient Boosting Accuracy: {gb_accuracy}")

# K-Nearest Neighbors (KNN) Classifier
base_classifier_knn = KNeighborsClassifier(n_neighbors=5)
knn_pred, knn_accuracy = _fit_and_score(base_classifier_knn)

# AdaBoost Classifier (seeded for repeatability)
base_classifier_ada = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=42)
ada_pred, ada_accuracy = _fit_and_score(base_classifier_ada)

# Voting Classifier.
# The ensemble fits its own clones of the listed estimators, so the models
# fitted above are not modified by this step.
voting_classifier = VotingClassifier(
    estimators=[
        ('decision_tree', base_classifier_dt),
        ('svm', base_classifier_svm),
        ('knn', base_classifier_knn),
        ('random_forest', base_classifier_rf),
        ('ada_boost', base_classifier_ada)
    ],
    voting='soft'  # 'hard' for majority voting, 'soft' for weighted voting based on probabilities
)
voting_pred, voting_accuracy = _fit_and_score(voting_classifier)

# Print accuracies together (fractions in [0, 1])
print(f"Decision Tree Accuracy: {dt_accuracy}")
print(f"SVM Accuracy: {svm_accuracy}")
print(f"KNN Accuracy: {knn_accuracy}")
print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"AdaBoost Accuracy: {ada_accuracy}")
print(f"Voting Classifier Accuracy: {voting_accuracy}")


Random Forest Accuracy: 0.9683481701285855
Gradient Boosting Accuracy: 0.1493570722057369


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Decision Tree Accuracy: 0.09792284866468842
SVM Accuracy: 0.9683481701285855
KNN Accuracy: 0.9683481701285855
Random Forest Accuracy: 0.9683481701285855
AdaBoost Accuracy: 0.1671612265084075
Voting Classifier Accuracy: 0.9683481701285855


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the soft-voting ensemble on the held-out rows.
# `voting_pred` holds the ensemble's predictions. The generic `y_pred` is
# reassigned by several earlier cells (in the cells visible above it was last
# set from the Naive Bayes model), so using it here would report another
# model's metrics under the "Voting Classifier" label.
voting_accuracy = accuracy_score(y_test, voting_pred)
print(f"Voting Classifier Accuracy: {voting_accuracy}")

# Class-frequency-weighted averages over all disease labels.
# zero_division=0 scores a class whose metric is undefined (for example, a class
# that is never predicted) as 0 — the same value the default uses — without
# emitting an undefined-metric warning for each call.
voting_precision = precision_score(y_test, voting_pred, average='weighted', zero_division=0)
voting_recall = recall_score(y_test, voting_pred, average='weighted', zero_division=0)
voting_f1 = f1_score(y_test, voting_pred, average='weighted', zero_division=0)

print(f"Voting Classifier Precision: {voting_precision}")
print(f"Voting Classifier Recall: {voting_recall}")
print(f"Voting Classifier F1 Score: {voting_f1}")

Voting Classifier Accuracy: 0.9683481701285855
Voting Classifier Precision: 0.9516013034728258
Voting Classifier Recall: 0.9683481701285855
Voting Classifier F1 Score: 0.9583380624599073


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
