### Importing libraries 

In [1]:
# Importing libraries
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from scipy.stats import mode
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# from flask import Flask

# Route for seeing a data

%matplotlib inline


### Importing necessary datasets for training 

In [2]:
# Load the disease/symptom training table and the symptom-severity lookup table.
train_dataset = pd.read_csv('../FYP_MachineLearning_SET/Dataset_NEW/dataset.csv')
severity_dataset = pd.read_csv('../FYP_MachineLearning_SET/Dataset_NEW/Symptom-severity.csv')

# Preview the first rows of both tables, separated by a divider line.
print(train_dataset.head())
print("=" * 30)
print(severity_dataset.head())

                                   Disease  Symptom_1  Symptom_2  \
0  (vertigo) Paroymsal  Positional Vertigo   vomiting   headache   
1  (vertigo) Paroymsal  Positional Vertigo   vomiting   headache   
2  (vertigo) Paroymsal  Positional Vertigo   headache     nausea   
3  (vertigo) Paroymsal  Positional Vertigo   vomiting     nausea   
4  (vertigo) Paroymsal  Positional Vertigo   vomiting   headache   

             Symptom_3            Symptom_4         Symptom_5      Symptom_6  \
0               nausea   spinning_movements   loss_of_balance   unsteadiness   
1               nausea   spinning_movements   loss_of_balance   unsteadiness   
2   spinning_movements      loss_of_balance      unsteadiness            NaN   
3   spinning_movements      loss_of_balance      unsteadiness            NaN   
4   spinning_movements      loss_of_balance      unsteadiness            NaN   

  Symptom_7 Symptom_8 Symptom_9 Symptom_10 Symptom_11 Symptom_12 Symptom_13  \
0       NaN       NaN       NaN

### Cleaning of data

In [3]:
cols = train_dataset.columns

# Flatten every cell into one Series so a single `.str.strip()` pass removes
# the padding around each text value, then fold the result back into the
# table's original shape.
flat_cells = pd.Series(train_dataset[cols].values.flatten())
stripped_cells = flat_cells.str.strip().values.reshape(train_dataset.shape)

# Rebuild the frame with the same column labels; missing entries become 0.
train_dataset = pd.DataFrame(stripped_cells, columns=train_dataset.columns).fillna(0)
train_dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,0,0,0,0,0,0,0,0,0,0,0
1,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,0,0,0,0,0,0,0,0,0,0,0
2,(vertigo) Paroymsal Positional Vertigo,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,0,0,0,0,0,0,0,0,0,0,0,0
3,(vertigo) Paroymsal Positional Vertigo,vomiting,nausea,spinning_movements,loss_of_balance,unsteadiness,0,0,0,0,0,0,0,0,0,0,0,0
4,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,spinning_movements,loss_of_balance,unsteadiness,0,0,0,0,0,0,0,0,0,0,0,0


### Encoding the symptoms with their severity weight

In [4]:
# Replace every symptom name in the table with its numeric severity weight.
#
# Work on an explicit copy of the cell values: `DataFrame.values` may hand back
# a view of the frame's internal buffer (read-only under pandas Copy-on-Write),
# so writing into it either changes `train_dataset` as a hidden side effect or
# fails outright, depending on the pandas version.
vals = train_dataset.to_numpy(copy=True)
symptoms = severity_dataset['Symptom'].unique()
# len(symptoms)

# symptom name -> weight. If a symptom is listed more than once, the first
# listed weight wins, which matches the previous `.values[0]` lookup.
weight_by_symptom = {}
for symptom, weight in zip(severity_dataset['Symptom'], severity_dataset['weight']):
    weight_by_symptom.setdefault(symptom, weight)

for symptom, weight in weight_by_symptom.items():
    vals[vals == symptom] = weight

se_data = pd.DataFrame(vals, columns=cols)

# Rebind explicitly so the cells below, which read `train_dataset`, always see
# the encoded table instead of depending on the write-through behaviour above.
train_dataset = se_data
train_dataset

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,(vertigo) Paroymsal Positional Vertigo,5,3,5,6,4,4,0,0,0,0,0,0,0,0,0,0,0
1,(vertigo) Paroymsal Positional Vertigo,5,3,5,6,4,4,0,0,0,0,0,0,0,0,0,0,0
2,(vertigo) Paroymsal Positional Vertigo,3,5,6,4,4,0,0,0,0,0,0,0,0,0,0,0,0
3,(vertigo) Paroymsal Positional Vertigo,5,5,6,4,4,0,0,0,0,0,0,0,0,0,0,0,0
4,(vertigo) Paroymsal Positional Vertigo,5,3,6,4,4,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,Varicose veins,4,4,4,4,5,5,6,0,0,0,0,0,0,0,0,0,0
4916,Varicose veins,4,4,4,4,5,5,6,0,0,0,0,0,0,0,0,0,0
4917,Varicose veins,4,4,4,4,5,5,6,0,0,0,0,0,0,0,0,0,0
4918,Varicose veins,4,4,4,4,5,5,6,0,0,0,0,0,0,0,0,0,0


### Storing the diseases and encoded symptoms in separate arrays

In [5]:
# Quick sanity checks. Only the last expression of a cell is displayed, so
# these are evaluated for interactive inspection and produce no output here.
(train_dataset[cols] == 0).all()
train_dataset['Disease'].value_counts()  # is the dataset balanced across diseases?
train_dataset['Disease'].unique()

# Labels are the disease names; features are every column after the first one.
train_labels = train_dataset['Disease'].to_numpy()
train_data = train_dataset.iloc[:, 1:].to_numpy()

In [6]:
def score_CV(estimator, X, y):
    """Scoring callable for k-fold cross-validation.

    Predicts ``X`` with the already-fitted ``estimator`` and returns the
    accuracy of those predictions against the true labels ``y``.
    """
    predicted = estimator.predict(X)
    return accuracy_score(y, predicted)

### Cross-validation to find the best model

In [7]:
# Candidate classifiers to compare, keyed by the name printed in the report.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=20, random_state=20),
    "Gaussian NB": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=1),
    "SVC": SVC(kernel='linear', C=1, random_state=0),
}

# 10-fold cross-validation of every candidate on the whole dataset,
# scored with `score_CV` and run on all available cores.
for model_name, model in models.items():
    scores = cross_val_score(
        model,
        train_data,
        train_labels,
        cv=10,
        scoring=score_CV,
        n_jobs=-1,
    )
    print("==" * 40)
    print(model_name)
    print(f"Scores: {scores}\n")
    print(f"Mean Score: {np.mean(scores)}")

Random Forest
Scores: [0.98577236 0.98577236 0.9796748  0.98373984 0.98780488 1.
 1.         1.         1.         1.        ]

Mean Score: 0.9922764227642276
Gaussian NB
Scores: [0.82520325 0.79674797 0.82317073 0.81707317 0.81910569 0.95121951
 0.95121951 0.95121951 0.95121951 0.95121951]

Mean Score: 0.883739837398374
KNN
Scores: [0.98780488 0.98577236 0.98170732 0.98373984 0.98780488 1.
 1.         1.         1.         1.        ]

Mean Score: 0.9926829268292682
SVC
Scores: [0.96747967 0.96341463 0.95934959 0.96544715 0.9695122  1.
 1.         1.         1.         1.        ]

Mean Score: 0.9825203252032519


### Training models 

In [None]:
# Hold out 10% of the rows for testing. The shuffle is seeded so that
# Restart & Run All reproduces the same partition, and therefore the same
# metrics in every evaluation cell below.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    train_labels,
    shuffle=True,
    train_size=0.90,
    random_state=42,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

In [None]:
# Fit the three base classifiers on the training split. They are kept as
# standalone fitted models because later cells evaluate each one individually.
final_rf_model = RandomForestClassifier(n_estimators=20, random_state=20)
final_svm_model = SVC(probability=True, kernel='linear', C=1, random_state=0)
final_knn_model = KNeighborsClassifier(n_neighbors=1)

for base_model in (final_rf_model, final_svm_model, final_knn_model):
    base_model.fit(x_train, y_train)

# Combine the models in a soft-voting ensemble (weighted average of the
# per-class probabilities) and fit it on the same training split.
ensemble = VotingClassifier(
    estimators=[
        ("rf", final_rf_model),
        ("svm", final_svm_model),
        ("knn", final_knn_model),
    ],
    voting="soft",
    weights=[0.4, 0.5, 0.1],
)
ensemble.fit(x_train, y_train)

In [None]:
# Predict the held-out test rows with the fitted ensemble; `ensemble_pred`
# is reused by the metric and confusion-matrix cells below.
ensemble_pred = ensemble.predict(x_test)

# Evaluate the performance of the ensemble model
accuracy = accuracy_score(y_test, ensemble_pred)
print("Ensemble accuracy: ", accuracy)

In [None]:
# Summary metrics for the ensemble on the test split. Precision, recall and
# F1 are averaged across classes using the 'weighted' scheme.
accuracy = accuracy_score(y_test, ensemble_pred)
precision = precision_score(y_test, ensemble_pred, average='weighted')
recall = recall_score(y_test, ensemble_pred, average='weighted')
f1 = f1_score(y_test, ensemble_pred, average='weighted')

metric_rows = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
)
for metric_name, metric_value in metric_rows:
    print(f"{metric_name}: {metric_value}")

In [None]:
# Class probabilities from the ensemble for the test rows (predict_proba
# instead of predict), printed for inspection.
ensemble_pred_prob = ensemble.predict_proba(x_test)
print(ensemble_pred_prob)

In [None]:
# Index of the highest-probability class for every test row.
# Computed straight from the model rather than from `ensemble_pred_prob`
# itself: the previous form overwrote its own input, so running the cell a
# second time applied argmax(axis=1) to a 1-D array and raised an error.
# The result keeps the original variable name.
ensemble_pred_prob = np.argmax(ensemble.predict_proba(x_test), axis=1)

In [None]:
# Test-set predictions from each individually fitted base classifier,
# used by the per-model accuracy/confusion-matrix cell below.
# (The Naive Bayes model is not part of the final set.)
svm_preds, knn_preds, rf_preds = (
    fitted_model.predict(x_test)
    for fitted_model in (final_svm_model, final_knn_model, final_rf_model)
)

### Testing accuracy on each model 

In [None]:
def report_test_performance(model_label, predictions):
    """Print the test accuracy and plot the confusion matrix for one model.

    Parameters
    ----------
    model_label : str
        Text inserted verbatim into the printed accuracy line and the
        figure title.
    predictions : array-like
        Predicted labels for the test rows; compared against the
        notebook-level ``y_test``.

    Returns
    -------
    The matrix produced by ``confusion_matrix(y_test, predictions)``.
    """
    print(f"Accuracy on test data by {model_label}: {accuracy_score(y_test, predictions)*100}")

    matrix = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(12,8))
    sns.heatmap(matrix, annot=True)
    plt.title(f"Confusion Matrix for {model_label} on Test Data")
    plt.show()
    return matrix


# One report per model, in the same order as before: SVM, K-Nearest Neighbors,
# Random Forest, then the ensemble. The K-Nearest Neighbors label keeps its
# leading space so the printed line and the title read exactly as they did.
for model_label, predictions in [
    ("SVM Classifier", svm_preds),
    (" K-Nearest Neighbors Classifier", knn_preds),
    ("Random Forest Classifier", rf_preds),
    ("Ensemble model", ensemble_pred),
]:
    # `cf_matrix` ends up holding the ensemble's matrix, as it did previously.
    cf_matrix = report_test_performance(model_label, predictions)

### Make the final prediction 

In [None]:
# Headline result: ensemble accuracy (in percent) on the held-out test split.
ensemble_accuracy_pct = accuracy_score(y_test, ensemble_pred)*100
print(f"Test dataset Accuracy - Combined model-Ensembled Model: {ensemble_accuracy_pct}")

# Confusion matrix of the ensemble, drawn with the colormap centred on 1.
cf_matrix = confusion_matrix(y_test, ensemble_pred)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cf_matrix, annot=True, center=1, ax=ax)
ax.set_title("Combined model on Test Dataset - Confussion Matrix")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the ensemble on the test split.
# In a medical diagnosis problem recall is prioritised over precision, since
# it is more important to catch all the cases of a disease.
ensemble_report = classification_report(y_test, ensemble_pred)
print(ensemble_report)

In [None]:
# Persist the fitted ensemble with joblib. The file name is relative, so it
# is written to the kernel's current working directory.
# NOTE(review): the load cell below reads
# '../FYP_MachineLearning_SET/symptoms_model.sav' — confirm both paths resolve
# to the same file, otherwise a stale model may be loaded.
filename = 'symptoms_model.sav'
joblib.dump(ensemble, filename)

In [None]:
# joblib is already imported in the first cell; this repeated import is redundant.
import joblib
# Reload the persisted ensemble from disk, replacing the in-memory `ensemble`.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
ensemble = joblib.load('../FYP_MachineLearning_SET/symptoms_model.sav')

def predictDisease(psymptoms=None):
    """Predict the most likely disease for a list of symptom names.

    Each symptom is converted to its severity weight from
    ``severity_dataset``, the weights are zero-padded to the width the model
    was trained on, and the resulting row is scored by the ensemble and by
    each of its fitted base estimators.

    Parameters
    ----------
    psymptoms : sequence of str, optional
        Symptom names exactly as they appear in ``severity_dataset["Symptom"]``,
        e.g. ``["vomiting", "headache", "nausea"]``. When omitted, the
        notebook's built-in demo case is used.

    Returns
    -------
    dict
        ``rf_model_prediction``, ``knn_model_prediction``,
        ``svm_model_prediction`` and ``ensemble_model_prediction`` hold the
        predicted disease labels; ``probabilities`` is a ``(1, 3)`` array with
        the three highest ensemble probabilities in descending order.

    Raises
    ------
    ValueError
        If a symptom name is not present in ``severity_dataset`` or more
        symptoms are given than the model has input features.
    """
    if psymptoms is None:
        psymptoms = ["skin_rash", "high_fever", "blister", "red_sore_around_nose", "yellow_crust_ooze"]

    # symptom name -> severity weight. The first listed weight wins for a
    # duplicated name, which is what the previous linear scan produced.
    weight_by_symptom = {}
    for name, weight in zip(severity_dataset["Symptom"], severity_dataset["weight"]):
        weight_by_symptom.setdefault(name, weight)

    # Fail early with a readable message; an unmapped name used to stay a
    # string in the feature row and fail deep inside scikit-learn.
    unknown = [name for name in psymptoms if name not in weight_by_symptom]
    if unknown:
        raise ValueError(f"Unknown symptom(s): {unknown}")

    # Pad to the model's input width instead of a fixed 12 zeros, which was
    # only correct for exactly 5 symptoms. 17 (= 5 + 12) is the fallback width.
    n_features = getattr(ensemble, "n_features_in_", 17)
    if len(psymptoms) > n_features:
        raise ValueError(
            f"Got {len(psymptoms)} symptoms but the model accepts at most {n_features}"
        )
    weights = [weight_by_symptom[name] for name in psymptoms]
    psy = [weights + [0] * (n_features - len(weights))]

    def _base_prediction(estimator_name):
        # Use the estimator fitted inside the ensemble, so the function also
        # works right after the model is reloaded from disk. Those estimators
        # predict label-encoded classes, so decode with the ensemble's encoder.
        encoded = ensemble.named_estimators_[estimator_name].predict(psy)
        return ensemble.le_.inverse_transform(encoded)[0]

    # generating individual outputs
    rf_prediction = _base_prediction("rf")
    knn_prediction = _base_prediction("knn")
    svm_prediction = _base_prediction("svm")
    ensemble_prediction = ensemble.predict(psy)[0]

    # One row of class probabilities, shape (1, n_classes).
    ensemble_predict_proba = ensemble.predict_proba(psy)[0].reshape(1, -1)

    # Class indices and probabilities sorted from most to least likely.
    sorted_proba_indices = np.argsort(ensemble_predict_proba, axis=1)[:, ::-1]
    sorted_probabilities = np.sort(ensemble_predict_proba, axis=1)[:, ::-1]

    # Keep the three most likely classes.
    top3_indices = sorted_proba_indices[:, :3]
    top3_prob_indices = sorted_probabilities[:, :3]

    labels = ensemble.classes_
    top3_labels = [labels[i] for i in top3_indices.flatten()]
    print (top3_labels)
    print (top3_prob_indices)

    predictions = {
        "rf_model_prediction": rf_prediction,
        "knn_model_prediction": knn_prediction,
        "svm_model_prediction": svm_prediction,
        "ensemble_model_prediction": ensemble_prediction,
        "probabilities": top3_prob_indices
    }

    return predictions

predictDisease()