In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides warnings raised by the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Load the symptom/prognosis table. The file carries an .xls extension but is parsed as CSV text here.
df=pd.read_csv('symbipredict_2022.xls')

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(4961, 133)

In [5]:
# Preview the first five rows: symptom indicator columns followed by the 'prognosis' label.
df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection


In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode the disease names in 'prognosis' as integer class ids, overwriting the column.
# `le` keeps the name <-> id mapping and is needed later to turn predictions back into names.
#
# The guard makes this cell idempotent: because the column is overwritten in place, running
# the cell a second time would otherwise refit the encoder on the integer ids, and
# `le.inverse_transform` would then return integers instead of disease names.
# NOTE(review): if the source file ever ships numeric labels, `le` is not created here.
if not pd.api.types.is_numeric_dtype(df['prognosis']):
    le=LabelEncoder()
    df['prognosis']=le.fit_transform(df['prognosis'])

In [7]:
# Preview the first five rows again; 'prognosis' was overwritten by the label-encoding cell.
df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14


In [8]:
# Split the frame positionally: the trailing column is the target,
# every column before it is a feature.
last_col = df.shape[1] - 1
X = df.iloc[:, :last_col]
y = df.iloc[:, last_col]

In [9]:


# Collect the feature column labels into a plain Python list and show them
column_names = list(X.columns)
print(column_names)


['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing', 'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity', 'ulcers_on_tongue', 'muscle_wasting', 'vomiting', 'burning_micturition', 'spotting_ urination', 'fatigue', 'weight_gain', 'anxiety', 'cold_hands_and_feets', 'mood_swings', 'weight_loss', 'restlessness', 'lethargy', 'patches_in_throat', 'irregular_sugar_level', 'cough', 'high_fever', 'sunken_eyes', 'breathlessness', 'sweating', 'dehydration', 'indigestion', 'headache', 'yellowish_skin', 'dark_urine', 'nausea', 'loss_of_appetite', 'pain_behind_the_eyes', 'back_pain', 'constipation', 'abdominal_pain', 'diarrhoea', 'mild_fever', 'yellow_urine', 'yellowing_of_eyes', 'acute_liver_failure', 'fluid_overload', 'swelling_of_stomach', 'swelled_lymph_nodes', 'malaise', 'blurred_and_distorted_vision', 'phlegm', 'throat_irritation', 'redness_of_eyes', 'sinus_pressure', 'runny_nose', 'congestion', 'chest_pain', 'weakness_in_limbs', 'fast_heart_rate', 'pain_during_bow

In [10]:
# Display the target Series (the last column of df, selected in the previous cell).
y

0       14
1       14
2       14
3       14
4       14
        ..
4956    40
4957     1
4958    38
4959    35
4960    27
Name: prognosis, Length: 4961, dtype: int64

In [11]:
# Train/test split: hold out 25% of the rows for evaluation, with a fixed seed.
# Stratifying on y keeps the class proportions the same in both splits, so every
# prognosis class shows up in the test set; the one-vs-rest ROC-AUC computed in the
# evaluation loop needs each class to be present in the labels it is scored against.
# NOTE(review): stratification requires at least two rows per class — confirm for this dataset.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42,stratify=y)

In [12]:
import warnings
import joblib  # Import joblib for saving the model

warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 

In [13]:


# Candidate classifiers, keyed by the label used in the printed report.
# The stochastic estimators are seeded so that a re-run fits (and saves) the same models.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boost": GradientBoostingClassifier(random_state=42)
}


def _score_split(model, features, targets):
    """Compute the report metrics of a fitted classifier on one data split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    features : feature matrix of the split.
    targets : true labels of the split.

    Returns
    -------
    dict
        Metric label -> value, in the order the report prints them.
        F1, precision and recall are weighted averages; ROC-AUC is one-vs-rest.
    """
    predictions = model.predict(features)
    return {
        'Accuracy': accuracy_score(targets, predictions),
        'F1 score': f1_score(targets, predictions, average='weighted'),
        'Precision': precision_score(targets, predictions, average='weighted'),
        'Recall': recall_score(targets, predictions, average='weighted'),
        'Roc Auc Score': roc_auc_score(targets, model.predict_proba(features), multi_class='ovr'),
    }


def _print_scores(split_name, scores):
    """Print one report section: a heading followed by one line per metric."""
    print('Model performance for {} set'.format(split_name))
    for metric_name, value in scores.items():
        print('- {}: {:.4f}'.format(metric_name, value))


# Fit every candidate on the training split and report train/test metrics.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    train_scores = _score_split(model, X_train, y_train)
    test_scores = _score_split(model, X_test, y_test)

    print(model_name)

    _print_scores('Training', train_scores)

    print('----------------------------------')

    _print_scores('Test', test_scores)

    print('='*35)
    print('\n')
    if model_name == "Random Forest":
        # Persist the fitted forest; the later inference cells load it from this file.
        joblib.dump(model, 'random_forest_model.pkl')
        print("Random Forest model saved as 'random_forest_model.pkl'")


Logistic Regression
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000


Random Forest model saved as 'random_forest_model

In [14]:
# Highest class probability the fitted forest assigns to the test row at position 8
rfmodel = models["Random Forest"]
rfmodel.predict_proba(X_test)[8].max()

np.float64(1.0)

In [15]:
import pandas as pd

# Function to get diet and medication details for a predicted disease
def get_diet_for_disease(predicted_disease, diet_source='diet.xls'):
    """Look up diet and medication advice for a disease name.

    Parameters
    ----------
    predicted_disease : str
        Disease name, compared for exact equality against the 'Disease' column.
    diet_source : str, path-like or pandas.DataFrame, optional
        Where the advice table comes from. A DataFrame is used as-is, which lets
        callers load the table once and reuse it; anything else is passed to
        ``pd.read_csv``. Defaults to 'diet.xls', the file the function always read
        before this parameter existed.

    Returns
    -------
    dict
        On a match: the keys 'Disease', 'Suggested Diet', 'Foods to Eat',
        'Foods to Avoid' and 'Medication', taken from the first matching row.
        Otherwise: ``{"Error": "Disease not found in the dataset"}``.
    """
    # Reuse a preloaded table when given one; otherwise parse the file as CSV.
    if isinstance(diet_source, pd.DataFrame):
        diet_table = diet_source
    else:
        diet_table = pd.read_csv(diet_source)

    # Rows whose disease name matches the prediction exactly
    matches = diet_table[diet_table['Disease'] == predicted_disease]

    # Guard clause: nothing to report when the disease is absent from the table
    if matches.empty:
        return {"Error": "Disease not found in the dataset"}

    detail_columns = ('Suggested Diet', 'Foods to Eat', 'Foods to Avoid', 'Medication')
    details = {'Disease': predicted_disease}
    # Only the first matching row is reported, even if the table has several.
    details.update({column: matches[column].values[0] for column in detail_columns})
    return details



In [16]:
import joblib
import pandas as pd

# Write the fitted forest and the label encoder to disk, one joblib file each
rf_model = models["Random Forest"]
for _artifact, _target_file in ((rf_model, 'random_forest_model.pkl'), (le, 'label_encoder.pkl')):
    joblib.dump(_artifact, _target_file)

# Snapshot the diet table as a pickle
diet_df = pd.read_csv('diet.xls')
diet_df.to_pickle('diet_data.pkl')

# Read everything back from the files that were just written
rf_model_loaded, le_loaded = (
    joblib.load(_source_file)
    for _source_file in ('random_forest_model.pkl', 'label_encoder.pkl')
)
diet_df_loaded = pd.read_pickle('diet_data.pkl')


In [17]:
import numpy as np
import joblib

# Load the trained Random Forest model and LabelEncoder from the files written earlier
rf_model = joblib.load('random_forest_model.pkl')
le = joblib.load('label_encoder.pkl')

# Example input (1 represents the presence of the symptom, 0 represents absence)
input_example = np.array([1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 
                          0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 
                          0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 
                          1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 
                          1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0])

# Fail fast with a readable message if the vector does not have one entry per training feature
if input_example.size != rf_model.n_features_in_:
    raise ValueError(
        f"Expected {rf_model.n_features_in_} symptom flags, got {input_example.size}"
    )

# Reshape input to a single-row matrix
input_example = input_example.reshape(1, -1)

# The model was fitted on a DataFrame, so label the columns with the training feature
# names; this ties each flag to a named symptom and avoids the feature-name mismatch
# warning scikit-learn raises for unnamed input.
input_frame = pd.DataFrame(input_example, columns=rf_model.feature_names_in_)

# Predict the disease using the trained Random Forest model
predicted_class = rf_model.predict(input_frame)

# Reverse the transformation to get the predicted disease name
predicted_disease = le.inverse_transform(predicted_class)

# Display the predicted disease
print(f"Predicted Disease: {predicted_disease[0]}")
diet_info = get_diet_for_disease(predicted_disease[0])

# Display the diet information, or the lookup error when the disease has no diet entry
# (the lookup returns a dict with only an "Error" key in that case).
if "Error" in diet_info:
    print(f"\n{diet_info['Error']}")
else:
    print("\nDiet and Medication Details:")
    print(f"Suggested Diet: {diet_info['Suggested Diet']}")
    print(f"Foods to Eat: {diet_info['Foods to Eat']}")
    print(f"Foods to Avoid: {diet_info['Foods to Avoid']}")
    print(f"Medication: {diet_info['Medication']}")


Predicted Disease: Common Cold

Diet and Medication Details:
Suggested Diet: Vitamin C-rich diet, Hydration
Foods to Eat: Fruits, Vegetables, Broth-based soups
Foods to Avoid: Alcohol, Dairy, Sugary foods
Medication: Decongestants, Pain relievers
