In [1096]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import accuracy_score

In [1097]:
# Load and preprocess data
main_df = pd.read_csv("DATA/Full_Dataset.csv")

In [1098]:
main_df.head(10)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Dengue,fever,nausea,vomiting,rash,headache,pain_behind_the_eyes,joint_pain,muscle_pain,,,,,,,,,
1,Typhoid Fever,fever,fatigue,headache,nausea,abdominal_pain,rash,constipation,,,,,,,,,,
2,Typhoid Fever,fever,fatigue,headache,nausea,abdominal_pain,rash,diarrhea,,,,,,,,,,
3,Allergy,continuous_sneezing,shivering,chills,watering_from_eyes,,,,,,,,,,,,,
4,Allergy,shivering,chills,watering_from_eyes,,,,,,,,,,,,,,
5,Allergy,continuous_sneezing,chills,watering_from_eyes,,,,,,,,,,,,,,
6,Allergy,continuous_sneezing,shivering,watering_from_eyes,,,,,,,,,,,,,,
7,Allergy,continuous_sneezing,shivering,chills,,,,,,,,,,,,,,
8,Common Cold,continuous_sneezing,chills,fatigue,cough,fever,headache,swelled_lymph_nodes,malaise,phlegm,throat_irritation,redness_of_eyes,sinus_pressure,runny_nose,congestion,chest_pain,loss_of_smell,muscle_pain
9,Common Cold,continuous_sneezing,chills,fatigue,cough,fever,headache,swelled_lymph_nodes,malaise,phlegm,throat_irritation,redness_of_eyes,sinus_pressure,runny_nose,congestion,chest_pain,loss_of_smell,muscle_pain


In [1099]:
main_df.shape

(114, 18)

In [1100]:
main_df.Disease.value_counts()

Disease
Common Cold             10
Fungal infection        10
Drug Reaction           10
Diabetes                10
Gastroenteritis         10
Migraine                10
Cervical spondylosis    10
Chronic cholestasis      9
Peptic ulcer diseae      9
Bronchial Asthma         9
Hypertension             9
Allergy                  5
Typhoid Fever            2
Dengue                   1
Name: count, dtype: int64

In [1101]:
df = main_df.copy()

In [1102]:
df.dropna(axis=1, how='all', inplace=True)

In [1103]:
df.fillna(0, inplace=True)

In [1104]:
class CustomLabelEncoder(LabelEncoder):
    def __init__(self, start = 0):
        self.start = start
        super().__init__()

    def fit_transform(self, y):
        encoded = super().fit_transform(y)
        encoded += self.start
        return encoded

In [1105]:
flattened_series = df['Disease'].astype(str)

In [1106]:
encoder = CustomLabelEncoder(start=200) 

In [1107]:
encoded_values = encoder.fit_transform(flattened_series)

In [1108]:
df['Disease'] = encoded_values

In [1109]:
mapping_data = {'label_encoder' : encoder}

In [1110]:
label_mapping = {k: v for k, v in zip(mapping_data['label_encoder'].classes_, range(200, 200+len(mapping_data['label_encoder'].classes_)))}

In [1111]:
label_mapping

{'Allergy': 200,
 'Bronchial Asthma': 201,
 'Cervical spondylosis': 202,
 'Chronic cholestasis': 203,
 'Common Cold': 204,
 'Dengue': 205,
 'Diabetes ': 206,
 'Drug Reaction': 207,
 'Fungal infection': 208,
 'Gastroenteritis': 209,
 'Hypertension ': 210,
 'Migraine': 211,
 'Peptic ulcer diseae': 212,
 'Typhoid Fever': 213}

In [1112]:
df = main_df.copy()

In [1113]:
df

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Dengue,fever,nausea,vomiting,rash,headache,pain_behind_the_eyes,joint_pain,muscle_pain,,,,,,,,,
1,Typhoid Fever,fever,fatigue,headache,nausea,abdominal_pain,rash,constipation,,,,,,,,,,
2,Typhoid Fever,fever,fatigue,headache,nausea,abdominal_pain,rash,diarrhea,,,,,,,,,,
3,Allergy,continuous_sneezing,shivering,chills,watering_from_eyes,,,,,,,,,,,,,
4,Allergy,shivering,chills,watering_from_eyes,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,Cervical spondylosis,back_pain,weakness_in_limbs,neck_pain,loss_of_balance,,,,,,,,,,,,,
110,Cervical spondylosis,back_pain,weakness_in_limbs,neck_pain,dizziness,,,,,,,,,,,,,
111,Cervical spondylosis,back_pain,weakness_in_limbs,neck_pain,dizziness,loss_of_balance,,,,,,,,,,,,
112,Cervical spondylosis,weakness_in_limbs,neck_pain,dizziness,loss_of_balance,,,,,,,,,,,,,


In [1114]:
df['All Symptoms'] = df.apply(lambda row: ','.join(row.dropna()), axis=1)
df['All Symptoms'] = df['All Symptoms'].apply(lambda x: ','.join(sorted(set(x.split(','))) if x else ''))

In [1115]:
df['All Symptoms']

0      Dengue,fever,headache,joint_pain,muscle_pain,n...
1      Typhoid Fever,abdominal_pain,constipation,fati...
2      Typhoid Fever,abdominal_pain,diarrhea,fatigue,...
3       chills, continuous_sneezing, shivering, water...
4          chills, shivering, watering_from_eyes,Allergy
                             ...                        
109     back_pain, loss_of_balance, neck_pain, weakne...
110     back_pain, dizziness, neck_pain, weakness_in_...
111     back_pain, dizziness, loss_of_balance, neck_p...
112     dizziness, loss_of_balance, neck_pain, weakne...
113     back_pain, dizziness, loss_of_balance, neck_p...
Name: All Symptoms, Length: 114, dtype: object

In [1116]:
stay_cols= ['Disease', 'All Symptoms']
stay_cols

['Disease', 'All Symptoms']

In [1117]:
df = df[stay_cols]

In [1118]:
df.head()

Unnamed: 0,Disease,All Symptoms
0,Dengue,"Dengue,fever,headache,joint_pain,muscle_pain,n..."
1,Typhoid Fever,"Typhoid Fever,abdominal_pain,constipation,fati..."
2,Typhoid Fever,"Typhoid Fever,abdominal_pain,diarrhea,fatigue,..."
3,Allergy,"chills, continuous_sneezing, shivering, water..."
4,Allergy,"chills, shivering, watering_from_eyes,Allergy"


In [1119]:
df['All Symptoms'][0]

'Dengue,fever,headache,joint_pain,muscle_pain,nausea,pain_behind_the_eyes,rash,vomiting'

In [1120]:
import re

In [1121]:
def strip_to_basic_tokens(text):
    # Remove doble spaces and underscores
    text = re.sub(r'[_\s]+', ' ', text)
    # Split by commas and lowercase the tokens
    tokens = [token.strip().lower() for token in text.split(',')]
    return tokens

In [1122]:
df['Basic Tokens'] = df['All Symptoms'].apply(strip_to_basic_tokens)
df['Basic Tokens'] = df['Basic Tokens'].apply(lambda x: ', '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Basic Tokens'] = df['All Symptoms'].apply(strip_to_basic_tokens)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Basic Tokens'] = df['Basic Tokens'].apply(lambda x: ', '.join(x))


In [1123]:
df.head()

Unnamed: 0,Disease,All Symptoms,Basic Tokens
0,Dengue,"Dengue,fever,headache,joint_pain,muscle_pain,n...","dengue, fever, headache, joint pain, muscle pa..."
1,Typhoid Fever,"Typhoid Fever,abdominal_pain,constipation,fati...","typhoid fever, abdominal pain, constipation, f..."
2,Typhoid Fever,"Typhoid Fever,abdominal_pain,diarrhea,fatigue,...","typhoid fever, abdominal pain, diarrhea, fatig..."
3,Allergy,"chills, continuous_sneezing, shivering, water...","chills, continuous sneezing, shivering, wateri..."
4,Allergy,"chills, shivering, watering_from_eyes,Allergy","chills, shivering, watering from eyes, allergy"


In [1124]:
df = df.drop(['All Symptoms'], axis = 1)

In [1125]:
df.head()

Unnamed: 0,Disease,Basic Tokens
0,Dengue,"dengue, fever, headache, joint pain, muscle pa..."
1,Typhoid Fever,"typhoid fever, abdominal pain, constipation, f..."
2,Typhoid Fever,"typhoid fever, abdominal pain, diarrhea, fatig..."
3,Allergy,"chills, continuous sneezing, shivering, wateri..."
4,Allergy,"chills, shivering, watering from eyes, allergy"


In [1126]:
df['Basic Tokens'][0]

'dengue, fever, headache, joint pain, muscle pain, nausea, pain behind the eyes, rash, vomiting'

In [1127]:
dfE = df.copy() # Taking a copy because we never know what might happen
dfE['Basic Tokens'] = dfE['Basic Tokens'].apply(lambda x: x.split(', '))

In [1128]:
dfE['Basic Tokens'].head()

0    [dengue, fever, headache, joint pain, muscle p...
1    [typhoid fever, abdominal pain, constipation, ...
2    [typhoid fever, abdominal pain, diarrhea, fati...
3    [chills, continuous sneezing, shivering, water...
4     [chills, shivering, watering from eyes, allergy]
Name: Basic Tokens, dtype: object

In [1129]:
mlb = MultiLabelBinarizer()
# Fit and transform the 'Basic Tokens' column
one_hot_encoded = pd.DataFrame(mlb.fit_transform(dfE['Basic Tokens']), columns=mlb.classes_, index=df.index)

In [1130]:
# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([dfE, one_hot_encoded], axis=1)

In [1131]:
df_encoded

Unnamed: 0,Disease,Basic Tokens,abdominal pain,acidity,allergy,back pain,blurred and distorted vision,breathlessness,bronchial asthma,burning micturition,...,swelled lymph nodes,throat irritation,typhoid fever,visual disturbances,vomiting,watering from eyes,weakness in limbs,weight loss,yellowing of eyes,yellowish skin
0,Dengue,"[dengue, fever, headache, joint pain, muscle p...",0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Typhoid Fever,"[typhoid fever, abdominal pain, constipation, ...",1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Typhoid Fever,"[typhoid fever, abdominal pain, diarrhea, fati...",1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Allergy,"[chills, continuous sneezing, shivering, water...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,Allergy,"[chills, shivering, watering from eyes, allergy]",0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,Cervical spondylosis,"[back pain, loss of balance, neck pain, weakne...",0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
110,Cervical spondylosis,"[back pain, dizziness, neck pain, weakness in ...",0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
111,Cervical spondylosis,"[back pain, dizziness, loss of balance, neck p...",0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
112,Cervical spondylosis,"[dizziness, loss of balance, neck pain, weakne...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [1132]:
# Drop the 'Basic Tokens' column
df_encoded = df_encoded.drop(columns=['Basic Tokens'])
df_encoded.head()

Unnamed: 0,Disease,abdominal pain,acidity,allergy,back pain,blurred and distorted vision,breathlessness,bronchial asthma,burning micturition,cervical spondylosis,...,swelled lymph nodes,throat irritation,typhoid fever,visual disturbances,vomiting,watering from eyes,weakness in limbs,weight loss,yellowing of eyes,yellowish skin
0,Dengue,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Typhoid Fever,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Typhoid Fever,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Allergy,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,Allergy,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [1133]:
df_encoded.shape

(114, 82)

In [1134]:
model_features = df_encoded.columns.tolist()

In [1135]:
model_features.remove("Disease")
X = df_encoded[model_features]
y = df_encoded["Disease"]

In [1136]:
# One-hot encode the target variable 'y' (multi-class)
y_encoded = pd.get_dummies(y)

In [1137]:
# Split the data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [1138]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

In [1139]:
# Train the model
rf_model.fit(X_train, y_train)

In [1140]:
# Make predictions on the validation set
y_pred = rf_model.predict(X_eval)

In [1141]:
# Evaluate the model's performance
accuracy = accuracy_score(y_eval, y_pred)
print(f"Validation Accuracy: {accuracy}")

Validation Accuracy: 1.0


In [1142]:
# Evaluate on the test set
y_test_pred = rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.9333333333333333


In [1143]:
# Making predictions for user input (same as you did with the TensorFlow model)
user_input = ["continuous_sneezing", "chills", "fatigue", "cough", "fever", "headache"]
#user_input = ["acidity", "indigestion", "headache", "blurred_and_distorted_vision", "excessive_hunger", "stiff_neck", "irritability",  "visual_disturbances"]

In [1144]:
def strip_to_basic_tokens(symptoms):
    # Check if symptoms is a list; if not, make it a list
    if isinstance(symptoms, str):
        symptoms = [symptoms]
    
    # Now process the list of symptoms
    symptoms = [symptom.strip().lower().replace(' ', '_').replace('_', ' ') for symptom in symptoms]
    return [re.sub(r'\s+', ' ', symptom) for symptom in symptoms]

In [1145]:
# Strip and preprocess user input
user_input_stripped = strip_to_basic_tokens(user_input)

In [1146]:
# Fit the MultiLabelBinarizer on the entire dataset
mlb = MultiLabelBinarizer(classes=df_encoded.columns)

In [1147]:
# Fit the MultiLabelBinarizer on the original data (or training data)
mlb.fit(df_encoded.columns)

In [1148]:
# Now, transform the user input after the MultiLabelBinarizer has been fitted
user_input_encoded = pd.DataFrame(mlb.transform([user_input_stripped]), columns=mlb.classes_)

In [1149]:
# Concatenate the user input with the original data (adjust for any necessary column alignment)
final_user_input = pd.concat([pd.DataFrame(columns=X.columns), user_input_encoded], axis=0)
final_user_input = final_user_input.drop(['Disease'], axis=1)

In [1150]:
# Print the final user input shape and check the result
final_user_input.head()

Unnamed: 0,abdominal pain,acidity,allergy,back pain,blurred and distorted vision,breathlessness,bronchial asthma,burning micturition,cervical spondylosis,chest pain,...,swelled lymph nodes,throat irritation,typhoid fever,visual disturbances,vomiting,watering from eyes,weakness in limbs,weight loss,yellowing of eyes,yellowish skin
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1151]:
# Predicting the class for the user input
user_pred = rf_model.predict(final_user_input)
predicted_class_index = np.argmax(user_pred)
prediction_encode = y_encoded.columns[predicted_class_index]

In [1152]:
print(f"Predicted Class: {prediction_encode}")

Predicted Class: Allergy


In [1153]:
import pickle

# Save the trained Random Forest model to a pickle file
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

# Optionally, save the label encoder and MultiLabelBinarizer used for transforming input
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(mapping_data['label_encoder'], le_file)

with open('mlb_encoder.pkl', 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)


In [1154]:
# Assume 'prediction_encode' is an integer representing the predicted class index
# If using RandomForest, make sure you're using the index of the predicted class

# Let's get the predicted class index from the one-hot encoded output (using np.argmax for multi-class output)
predicted_class_index = np.argmax(user_pred, axis=1)[0]  # Get the index of the max value from the prediction

# Use the index to get the corresponding class label from the y_encoded columns
prediction_encode = y_encoded.columns[predicted_class_index]

# Now, map the prediction using inverse_label_encoding
inverse_label_encoding = {v: k for k, v in label_mapping.items()}  # Reversing label_mapping for disease names
if prediction_encode in inverse_label_encoding:
    prediction = inverse_label_encoding[prediction_encode]
else:
    prediction = "Unknown Disease"  # Fallback message if not found

print(f"Predicted Disease: {prediction}")


Predicted Disease: Unknown Disease


In [1155]:
# Get the human-readable label for the prediction
inverse_label_encoding = {v: k for k, v in label_mapping.items()}
prediction = inverse_label_encoding[prediction_encode]
print(f"Predicted Disease: {prediction}")

KeyError: 'Allergy'

In [None]:
print(f"Predicted Class Index: {predicted_class_index}")
print(f"Predicted Class: {prediction_encode}")
print(f"Inverse Label Encoding: {inverse_label_encoding}")

Predicted Class Index: 0
Predicted Class: Allergy
Inverse Label Encoding: {200: 'Allergy', 201: 'Bronchial Asthma', 202: 'Cervical spondylosis', 203: 'Chronic cholestasis', 204: 'Common Cold', 205: 'Dengue', 206: 'Diabetes ', 207: 'Drug Reaction', 208: 'Fungal infection', 209: 'Gastroenteritis', 210: 'Hypertension ', 211: 'Migraine', 212: 'Peptic ulcer diseae', 213: 'Typhoid Fever'}
