In [134]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import accuracy_score

In [135]:
# Load and preprocess data
main_df = pd.read_csv("DATA/NewDB.csv")

In [136]:
main_df.head(10)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Allergy,continuous_sneezing,shivering,chills,watering_from_eyes,,,,,,,,,,,,,
1,Allergy,shivering,chills,watering_from_eyes,,,,,,,,,,,,,,
2,Allergy,continuous_sneezing,chills,watering_from_eyes,,,,,,,,,,,,,,
3,Allergy,continuous_sneezing,shivering,watering_from_eyes,,,,,,,,,,,,,,
4,Allergy,continuous_sneezing,shivering,chills,,,,,,,,,,,,,,
5,Allergy,continuous_sneezing,shivering,chills,watering_from_eyes,,,,,,,,,,,,,
6,Common Cold,continuous_sneezing,sore_throat,cough,,,,,,,,,,,,,,
7,Common Cold,continuous_sneezing,sore_throat,cough,blocked_nose,,,,,,,,,,,,,
8,Common Cold,continuous_sneezing,sore_throat,cough,runny_nose,,,,,,,,,,,,,
9,Common Cold,continuous_sneezing,sore_throat,cough,feeling_tired,,,,,,,,,,,,,


In [137]:
main_df.shape

(32, 18)

In [138]:
main_df.Disease.value_counts()

Disease
Common Cold    14
Gastritis      12
Allergy         6
Name: count, dtype: int64

In [139]:
df = main_df.copy()

In [140]:
df.dropna(axis=1, how='all', inplace=True)

In [141]:
df.fillna(0, inplace=True)

In [142]:
class CustomLabelEncoder(LabelEncoder):
    def __init__(self, start = 0):
        self.start = start
        super().__init__()

    def fit_transform(self, y):
        encoded = super().fit_transform(y)
        encoded += self.start
        return encoded

In [143]:
flattened_series = df['Disease'].astype(str)

In [144]:
encoder = CustomLabelEncoder(start=200) 

In [145]:
encoded_values = encoder.fit_transform(flattened_series)

In [146]:
df['Disease'] = encoded_values

In [147]:
mapping_data = {'label_encoder' : encoder}

In [148]:
label_mapping = {k: v for k, v in zip(mapping_data['label_encoder'].classes_, range(200, 200+len(mapping_data['label_encoder'].classes_)))}

In [149]:
label_mapping

{'Allergy': 200, 'Common Cold': 201, 'Gastritis': 202}

In [150]:
df = main_df.copy()

In [151]:
df

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Allergy,continuous_sneezing,shivering,chills,watering_from_eyes,,,,,,,,,,,,,
1,Allergy,shivering,chills,watering_from_eyes,,,,,,,,,,,,,,
2,Allergy,continuous_sneezing,chills,watering_from_eyes,,,,,,,,,,,,,,
3,Allergy,continuous_sneezing,shivering,watering_from_eyes,,,,,,,,,,,,,,
4,Allergy,continuous_sneezing,shivering,chills,,,,,,,,,,,,,,
5,Allergy,continuous_sneezing,shivering,chills,watering_from_eyes,,,,,,,,,,,,,
6,Common Cold,continuous_sneezing,sore_throat,cough,,,,,,,,,,,,,,
7,Common Cold,continuous_sneezing,sore_throat,cough,blocked_nose,,,,,,,,,,,,,
8,Common Cold,continuous_sneezing,sore_throat,cough,runny_nose,,,,,,,,,,,,,
9,Common Cold,continuous_sneezing,sore_throat,cough,feeling_tired,,,,,,,,,,,,,


In [152]:
df['All Symptoms'] = df.apply(lambda row: ','.join(row.dropna()), axis=1)
df['All Symptoms'] = df['All Symptoms'].apply(lambda x: ','.join(sorted(set(x.split(','))) if x else ''))

In [153]:
df['All Symptoms']

0      chills, continuous_sneezing, shivering, water...
1         chills, shivering, watering_from_eyes,Allergy
2      chills, continuous_sneezing, watering_from_ey...
3      continuous_sneezing, shivering, watering_from...
4        chills, continuous_sneezing, shivering,Allergy
5      chills, continuous_sneezing, shivering, water...
6     Common Cold,continuous_sneezing,cough,sore_throat
7     Common Cold,blocked_nose,continuous_sneezing,c...
8     Common Cold,continuous_sneezing,cough,runny_no...
9     Common Cold,continuous_sneezing,cough,feeling_...
10    Common Cold,continuous_sneezing,cough,feeling_...
11    Common Cold,blocked_nose ,continuous_sneezing,...
12    Common Cold,continuous_sneezing,cough,feeling_...
13    Common Cold,blocked_nose,continuous_sneezing,c...
14    Common Cold,continuous_sneezing,cough,feeling_...
15    Common Cold,continuous_sneezing,cough,feeling_...
16    Common Cold,continuous_sneezing,cough,feeling_...
17    Common Cold,continuous_sneezing,cough,feel

In [154]:
stay_cols= ['Disease', 'All Symptoms']
stay_cols

['Disease', 'All Symptoms']

In [155]:
df = df[stay_cols]

In [156]:
df.head()

Unnamed: 0,Disease,All Symptoms
0,Allergy,"chills, continuous_sneezing, shivering, water..."
1,Allergy,"chills, shivering, watering_from_eyes,Allergy"
2,Allergy,"chills, continuous_sneezing, watering_from_ey..."
3,Allergy,"continuous_sneezing, shivering, watering_from..."
4,Allergy,"chills, continuous_sneezing, shivering,Allergy"


In [157]:
df['All Symptoms'][0]

' chills, continuous_sneezing, shivering, watering_from_eyes,Allergy'

In [158]:
import re

In [159]:
def strip_to_basic_tokens(text):
    # Remove doble spaces and underscores
    text = re.sub(r'[_\s]+', ' ', text)
    # Split by commas and lowercase the tokens
    tokens = [token.strip().lower() for token in text.split(',')]
    return tokens

In [160]:
df['Basic Tokens'] = df['All Symptoms'].apply(strip_to_basic_tokens)
df['Basic Tokens'] = df['Basic Tokens'].apply(lambda x: ', '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Basic Tokens'] = df['All Symptoms'].apply(strip_to_basic_tokens)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Basic Tokens'] = df['Basic Tokens'].apply(lambda x: ', '.join(x))


In [161]:
df.head()

Unnamed: 0,Disease,All Symptoms,Basic Tokens
0,Allergy,"chills, continuous_sneezing, shivering, water...","chills, continuous sneezing, shivering, wateri..."
1,Allergy,"chills, shivering, watering_from_eyes,Allergy","chills, shivering, watering from eyes, allergy"
2,Allergy,"chills, continuous_sneezing, watering_from_ey...","chills, continuous sneezing, watering from eye..."
3,Allergy,"continuous_sneezing, shivering, watering_from...","continuous sneezing, shivering, watering from ..."
4,Allergy,"chills, continuous_sneezing, shivering,Allergy","chills, continuous sneezing, shivering, allergy"


In [162]:
df = df.drop(['All Symptoms'], axis = 1)

In [163]:
df.head()

Unnamed: 0,Disease,Basic Tokens
0,Allergy,"chills, continuous sneezing, shivering, wateri..."
1,Allergy,"chills, shivering, watering from eyes, allergy"
2,Allergy,"chills, continuous sneezing, watering from eye..."
3,Allergy,"continuous sneezing, shivering, watering from ..."
4,Allergy,"chills, continuous sneezing, shivering, allergy"


In [164]:
df['Basic Tokens'][0]

'chills, continuous sneezing, shivering, watering from eyes, allergy'

In [165]:
dfE = df.copy() # Taking a copy because we never know what might happen
dfE['Basic Tokens'] = dfE['Basic Tokens'].apply(lambda x: x.split(', '))

In [166]:
dfE['Basic Tokens'].head()

0    [chills, continuous sneezing, shivering, water...
1     [chills, shivering, watering from eyes, allergy]
2    [chills, continuous sneezing, watering from ey...
3    [continuous sneezing, shivering, watering from...
4    [chills, continuous sneezing, shivering, allergy]
Name: Basic Tokens, dtype: object

In [167]:
mlb = MultiLabelBinarizer()
# Fit and transform the 'Basic Tokens' column
one_hot_encoded = pd.DataFrame(mlb.fit_transform(dfE['Basic Tokens']), columns=mlb.classes_, index=df.index)

In [168]:
# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([dfE, one_hot_encoded], axis=1)

In [169]:
df_encoded

Unnamed: 0,Disease,Basic Tokens,allergy,blocked nose,body aches,chills,common cold,continuous sneezing,cough,cramping,...,loss of appetite,loss of smell,loss of taste,muscle pain,nausea,runny nose,shivering,sore throat,vomiting,watering from eyes
0,Allergy,"[chills, continuous sneezing, shivering, water...",1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
1,Allergy,"[chills, shivering, watering from eyes, allergy]",1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,Allergy,"[chills, continuous sneezing, watering from ey...",1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Allergy,"[continuous sneezing, shivering, watering from...",1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
4,Allergy,"[chills, continuous sneezing, shivering, allergy]",1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
5,Allergy,"[chills, continuous sneezing, shivering, water...",1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
6,Common Cold,"[common cold, continuous sneezing, cough, sore...",0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,1,0,0
7,Common Cold,"[common cold, blocked nose, continuous sneezin...",0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,1,0,0
8,Common Cold,"[common cold, continuous sneezing, cough, runn...",0,0,0,0,1,1,1,0,...,0,0,0,0,0,1,0,1,0,0
9,Common Cold,"[common cold, continuous sneezing, cough, feel...",0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [170]:
# Drop the 'Basic Tokens' column
df_encoded = df_encoded.drop(columns=['Basic Tokens'])
df_encoded.head()

Unnamed: 0,Disease,allergy,blocked nose,body aches,chills,common cold,continuous sneezing,cough,cramping,diarrhea,...,loss of appetite,loss of smell,loss of taste,muscle pain,nausea,runny nose,shivering,sore throat,vomiting,watering from eyes
0,Allergy,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,Allergy,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,Allergy,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Allergy,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,Allergy,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [171]:
df_encoded.shape

(32, 25)

In [172]:
model_features = df_encoded.columns.tolist()

In [173]:
model_features.remove("Disease")
X = df_encoded[model_features]
y = df_encoded["Disease"]

In [174]:
# One-hot encode the target variable 'y' (multi-class)
y_encoded = pd.get_dummies(y)

In [175]:
# Split the data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [176]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

In [177]:
# Train the model
rf_model.fit(X_train, y_train)

In [178]:
# Make predictions on the validation set
y_pred = rf_model.predict(X_eval)

In [179]:
# Evaluate the model's performance
accuracy = accuracy_score(y_eval, y_pred)
print(f"Validation Accuracy: {accuracy}")

Validation Accuracy: 1.0


In [180]:
# Evaluate on the test set
y_test_pred = rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 1.0


In [181]:
# Making predictions for user input (same as you did with the TensorFlow model)
#user_input = ["continuous_sneezing", "chills", "fatigue", "cough", "fever", "headache"]
#user_input = ["acidity", "indigestion", "headache", "blurred_and_distorted_vision", "excessive_hunger", "stiff_neck", "irritability",  "visual_disturbances"]

#user_input = ["continuous_sneezing", "chills", "fatigue", "cough"]
#user_input = ["continuous_sneezing", "shivering", "watering_from_eyes"]
user_input = ["chills","nausea","vomiting","fever"]


In [182]:
def strip_to_basic_tokens(symptoms):
    # Check if symptoms is a list; if not, make it a list
    if isinstance(symptoms, str):
        symptoms = [symptoms]
    
    # Now process the list of symptoms
    symptoms = [symptom.strip().lower().replace(' ', '_').replace('_', ' ') for symptom in symptoms]
    return [re.sub(r'\s+', ' ', symptom) for symptom in symptoms]

In [183]:
# Strip and preprocess user input
user_input_stripped = strip_to_basic_tokens(user_input)

In [184]:
# Fit the MultiLabelBinarizer on the entire dataset
mlb = MultiLabelBinarizer(classes=df_encoded.columns)

In [185]:
# Fit the MultiLabelBinarizer on the original data (or training data)
mlb.fit(df_encoded.columns)

In [186]:
# Now, transform the user input after the MultiLabelBinarizer has been fitted
user_input_encoded = pd.DataFrame(mlb.transform([user_input_stripped]), columns=mlb.classes_)

In [187]:
# Concatenate the user input with the original data (adjust for any necessary column alignment)
final_user_input = pd.concat([pd.DataFrame(columns=X.columns), user_input_encoded], axis=0)
final_user_input = final_user_input.drop(['Disease'], axis=1)

In [188]:
# Print the final user input shape and check the result
final_user_input.head()

Unnamed: 0,allergy,blocked nose,body aches,chills,common cold,continuous sneezing,cough,cramping,diarrhea,fatigue,...,loss of appetite,loss of smell,loss of taste,muscle pain,nausea,runny nose,shivering,sore throat,vomiting,watering from eyes
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [189]:
import pickle

# Save the trained Random Forest model to a pickle file
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

# Optionally, save the label encoder and MultiLabelBinarizer used for transforming input
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(mapping_data['label_encoder'], le_file)

with open('mlb_encoder.pkl', 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)


In [190]:
# Predicting the class for the user input
user_pred = rf_model.predict(final_user_input)
predicted_class_index = np.argmax(user_pred)
prediction_encode = y_encoded.columns[predicted_class_index]

In [191]:
print(f"Predicted Class: {prediction_encode}")

Predicted Class: Gastritis


In [192]:
# Assume 'prediction_encode' is an integer representing the predicted class index
# If using RandomForest, make sure you're using the index of the predicted class

# Let's get the predicted class index from the one-hot encoded output (using np.argmax for multi-class output)
predicted_class_index = np.argmax(user_pred, axis=1)[0]  # Get the index of the max value from the prediction

# Use the index to get the corresponding class label from the y_encoded columns
prediction_encode = y_encoded.columns[predicted_class_index]

# Now, map the prediction using inverse_label_encoding
inverse_label_encoding = {v: k for k, v in label_mapping.items()}  # Reversing label_mapping for disease names
if prediction_encode in inverse_label_encoding:
    prediction = inverse_label_encoding[prediction_encode]
else:
    prediction = "Unknown Disease"  # Fallback message if not found

print(f"Predicted Disease: {prediction}")


Predicted Disease: Unknown Disease


In [193]:
print(f"Predicted Class: {prediction_encode}")

Predicted Class: Gastritis


In [194]:
# Get the human-readable label for the prediction
inverse_label_encoding = {v: k for k, v in label_mapping.items()}
prediction = inverse_label_encoding[prediction_encode]
print(f"Predicted Disease: {prediction}")

KeyError: 'Gastritis'

In [None]:
print(f"Predicted Class Index: {predicted_class_index}")
print(f"Predicted Class: {prediction_encode}")
print(f"Inverse Label Encoding: {inverse_label_encoding}")

Predicted Class Index: 0
Predicted Class: Allergy
Inverse Label Encoding: {200: 'Allergy', 201: 'Bronchial Asthma', 202: 'Cervical spondylosis', 203: 'Chronic cholestasis', 204: 'Common Cold', 205: 'Dengue', 206: 'Diabetes ', 207: 'Drug Reaction', 208: 'Fungal infection', 209: 'Gastroenteritis', 210: 'Hypertension ', 211: 'Migraine', 212: 'Peptic ulcer diseae', 213: 'Typhoid Fever'}
