In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Load the disease/symptom table and the symptom severity table from the
# Kaggle input directory. data_severity is later read through its 'Symptom'
# and 'weight' columns.
df = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/dataset.csv')
data_severity = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/Symptom-severity.csv')

In [None]:
# Peek at the first two rows of the raw dataset.
df.head(2)

In [None]:
# Show the first rows of the severity table.
data_severity.head()

In [None]:
# Print every symptom next to its severity weight, one pair per line.
for symptom_name, severity_weight in zip(data_severity['Symptom'], data_severity['weight']):
    print(symptom_name, severity_weight)

In [None]:
# Convert data_severity to a dictionary keyed by symptom name; each value is a
# dict of that symptom's remaining columns (e.g. {'weight': ...}).
data_dict = data_severity.set_index('Symptom').T.to_dict()

In [None]:
# Display the symptom -> severity dictionary built above.
data_dict

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Column dtypes and non-null counts of the raw dataset.
df.info()

In [None]:
def remove_space_between_words(df):
    """Return a cleaned copy of ``df`` with normalised string cells.

    Every string cell in an object/string-typed column has its leading and
    trailing whitespace stripped and each remaining space replaced by an
    underscore (so ``"dischromic _patches"`` becomes ``"dischromic__patches"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a copy is returned instead, so
        re-running a notebook cell never changes a frame shown earlier.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the same columns and index. Missing values and any
        other non-string cells are passed through unchanged.
    """
    def _normalise(value):
        # Only real strings are rewritten; NaN and numbers are kept as they
        # are rather than being turned into NaN by the ``.str`` accessor.
        if isinstance(value, str):
            return value.strip().replace(" ", "_")
        return value

    cleaned = df.copy()
    for col in cleaned.columns:
        dtype = cleaned[col].dtype
        # Cover both classic object columns and pandas' dedicated string dtype.
        if dtype == object or isinstance(dtype, pd.StringDtype):
            cleaned[col] = cleaned[col].map(_normalise)
    return cleaned

In [None]:
# Normalise the text columns (strip whitespace, spaces -> underscores) and
# rebind df to the cleaned result, then preview it.
df = remove_space_between_words(df)
df.head()

In [None]:
# Raw values of every row whose Disease is 'Acne', to inspect the cleaned strings.
df[df['Disease']=='Acne'].values

In [None]:
def encode_symptoms(df, data_severity):
    """Replace symptom names in ``df`` with their numeric severity weights.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells may contain symptom names. It is not modified.
    data_severity : pandas.DataFrame
        Lookup table with a ``"Symptom"`` column (names) and a ``"weight"``
        column (replacement values).

    Returns
    -------
    pandas.DataFrame
        New frame in which every cell equal to a known symptom name holds that
        symptom's weight and every missing value is 0. Cells matching no
        symptom (for example disease names) are left unchanged.

    Notes
    -----
    All replacements are applied through a single mapping instead of one
    full-frame ``replace`` call per severity row. When a symptom is listed
    more than once in ``data_severity`` the first weight is used.
    """
    # Weights for spellings that occur in the cleaned dataset but are not
    # matched by the severity table; they only apply when the table has no
    # entry of its own for the same name.
    fallback_weights = {
        "foul_smell_of_urine": 5,
        "dischromic__patches": 6,
        "spotting__urination": 6,
    }

    # Keep the first occurrence of a duplicated symptom so it wins.
    severity = data_severity.drop_duplicates(subset="Symptom", keep="first")
    weight_by_symptom = dict(zip(severity["Symptom"], severity["weight"]))
    for symptom, weight in fallback_weights.items():
        weight_by_symptom.setdefault(symptom, weight)

    # Missing symptom slots (and any missing weights) become 0.
    return df.replace(weight_by_symptom).fillna(0)

In [None]:
# Encode every symptom name as its severity weight; missing symptoms become 0.
new_df = encode_symptoms(df, data_severity)

In [None]:
# Preview the encoded frame.
new_df.head()

In [None]:
# Collect every string that is still present in a symptom column after
# encoding, in first-seen order (column by column) and without duplicates.
# dict.fromkeys de-duplicates in one pass instead of re-scanning a list.
feature_columns = [col for col in new_df.columns if col != "Disease"]
names = list(dict.fromkeys(
    value
    for col in feature_columns
    for value in new_df[col]
    if isinstance(value, str)
))

# Any leftover string is a symptom that was not turned into a number, whether
# or not it appears in the severity table, and it would break the numeric
# steps that follow. The encoding is complete only if nothing is left.
all_replaced = not names

if all_replaced:
    print("All symptoms have been replaced.")
else:
    print("The following symptoms were not replaced:", names)



In [None]:
# Split the encoded frame into the target column (Y) and the feature columns (X).
Y = new_df['Disease']
X = new_df.drop(columns='Disease')

In [None]:
# Plain-text dump of the feature frame (new_df without the Disease column).
print(X)

In [None]:
# Plain-text dump of the Disease labels.
print(Y)

# Data Standardization

In [None]:
# Scaler used to standardize the encoded symptom features.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from X.
# NOTE(review): this fits on every row, before the train/test splits made in
# the model cells below, so rows later used for testing also influence the
# scaling. Consider fitting on the training split only.
scaler.fit(X)

In [None]:
# Apply the fitted scaling to the full feature matrix.
standardized_data = scaler.transform(X)

In [None]:
# Plain-text dump of the scaled features.
print(standardized_data)

In [None]:
# From here on the models train on the standardized features; the labels are
# still the Disease column of the encoded frame.
X, Y = standardized_data, new_df['Disease']

In [None]:
# X now refers to the standardized features assigned in the previous cell.
print(X)

In [None]:
# Disease labels paired with the standardized features.
print(Y)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Train a 100-tree forest on the training rows, then label the held-out rows.
rfc_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_classifier.fit(X_train, Y_train)
Y_pred = rfc_classifier.predict(X_test)

# Overall accuracy plus macro-averaged precision / recall / F1.
accuracy = accuracy_score(Y_test, Y_pred)
precision, recall, f1 = (
    metric(Y_test, Y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the metrics, the raw number of correct predictions, and the confusion matrix.
print(
    "Random Forest",
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-Score: {f1}',
    accuracy_score(Y_test, Y_pred, normalize=False),
    "Confusion matrix",
    sep="\n",
)
conf_matrix = confusion_matrix(Y_test, Y_pred)
print(conf_matrix)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same 80/20 split and seed as the Random Forest cell.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit a single seeded decision tree and predict the held-out labels.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, Y_train)
Y_pred = dt_classifier.predict(X_test)

# Accuracy and macro-averaged precision / recall / F1 on the test rows.
accuracy = accuracy_score(Y_test, Y_pred)
precision, recall, f1 = (
    metric(Y_test, Y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

# Metric summary followed by the number of correct predictions.
summary_lines = [
    "Decision Tree",
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-Score: {f1}',
    str(accuracy_score(Y_test, Y_pred, normalize=False)),
    "Confusion matrix",
]
print("\n".join(summary_lines))
conf_matrix = confusion_matrix(Y_test, Y_pred)
print(conf_matrix)

# Gaussian Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Create a Gaussian Naive Bayes classifier
NB_classifier = GaussianNB()

# Fit the model on the training data
NB_classifier.fit(X_train, Y_train)

# Predict disease labels on the testing data
Y_pred = NB_classifier.predict(X_test)

# Calculate accuracy for THIS model. Without this line the report below would
# print the accuracy left over from the previously executed model cell.
accuracy = accuracy_score(Y_test, Y_pred)

# Macro-averaged metrics: every disease class counts equally.
precision = precision_score(Y_test, Y_pred, average='macro')
recall = recall_score(Y_test, Y_pred,average='macro')
f1 = f1_score(Y_test, Y_pred, average='macro')

print("Gaussian NB")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
# Number of correctly classified test samples.
print(accuracy_score(Y_test, Y_pred,normalize=False))
print("Confusion matrix")
conf_matrix=confusion_matrix(Y_test,Y_pred)
print(conf_matrix)

# K-Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Split the data into training and testing sets with a random state
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Create a k-NN classifier and specify the number of neighbors e.g. 3
k=3
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Fit the model on the training data
knn_classifier.fit(X_train, Y_train)

# Predict the labels for the test data
Y_pred = knn_classifier.predict(X_test)

# Calculate accuracy for THIS model. Without this line the report below would
# print the accuracy left over from the previously executed model cell.
accuracy = accuracy_score(Y_test, Y_pred)

# Macro-averaged metrics: every disease class counts equally.
precision = precision_score(Y_test, Y_pred, average='macro')
recall = recall_score(Y_test, Y_pred,average='macro')
f1 = f1_score(Y_test, Y_pred, average='macro')

print("KNN")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
# Number of correctly classified test samples.
print(accuracy_score(Y_test, Y_pred,normalize=False))
print("Confusion matrix")
conf_matrix=confusion_matrix(Y_test,Y_pred)
print(conf_matrix)

# K-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.svm import SVC

# Model evaluated by cross-validation (SVC with its default settings).
svm_classifier = SVC()

# Specify the number of folds for cross-validation
num_folds = 10  # You can choose any number of folds you prefer

# Create a K-Fold cross-validation object; shuffling with a fixed seed keeps
# the fold assignment reproducible.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Score every fold on accuracy and on macro-averaged precision / recall / F1.
# All four numbers now come from the cross-validated SVC itself rather than
# from Y_test / Y_pred / accuracy variables left behind by earlier model cells.
cv_results = cross_validate(
    svm_classifier,
    X,
    Y,
    cv=kf,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
)
scores = cv_results['test_accuracy']

# Print the per-fold accuracy scores
print(f'Cross-Validation Scores: {scores}')

# High mean accuracy and a relatively low standard deviation in cross-validation
# scores are good indicators of consistent and reliable model performance
# across different subsets of the data.
mean_score = scores.mean()
std_deviation = scores.std()

# Fold-averaged metrics for the summary below.
accuracy = mean_score
precision = cv_results['test_precision_macro'].mean()
recall = cv_results['test_recall_macro'].mean()
f1 = cv_results['test_f1_macro'].mean()

print(f'Mean Accuracy: {mean_score}')
print(f'Standard Deviation: {std_deviation}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')



# SVM

In [None]:
from sklearn.svm import SVC

# Hold out 20% of the rows for evaluation.
# NOTE(review): this split uses random_state=100 while the other model cells
# use 42, so the SVM is scored on a different test set - confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=100
)

# Fit a linear-kernel SVM on the training rows and label the held-out rows.
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, Y_train)
Y_pred = svm_classifier.predict(X_test)

# Accuracy and macro-averaged precision / recall / F1 on the test rows.
accuracy = accuracy_score(Y_test, Y_pred)
precision, recall, f1 = (
    metric(Y_test, Y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

# Metric summary, number of correct predictions, then the confusion matrix.
print(
    "SVM",
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-Score: {f1}',
    accuracy_score(Y_test, Y_pred, normalize=False),
    "Confusion matrix",
    sep="\n",
)
conf_matrix = confusion_matrix(Y_test, Y_pred)
print(conf_matrix)