## Dataset

[Dataset source](https://www.kaggle.com/datasets/rohitadnaik/chatbot-symptom-description?select=symptom_Description.csv)

## Installations

In [32]:
!pip install pandas



## Import

In [33]:
import pandas as pd

In [34]:
symptoms_description_df = pd.read_csv('../data/symptom_Description.csv')
symptoms_precaution_df = pd.read_csv('../data/symptom_precaution.csv')

In [35]:
symptoms_description_df.head()

Unnamed: 0,Drug Reaction,An adverse drug reaction (ADR) is an injury caused by taking medication. ADRs may occur following a single dose or prolonged administration of a drug or result from the combination of two or more drugs.
0,Malaria,An infectious disease caused by protozoan para...
1,Allergy,An allergy is an immune system response to a f...
2,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
3,Psoriasis,Psoriasis is a common skin disorder that forms...
4,GERD,"Gastroesophageal reflux disease, or GERD, is a..."


In [36]:
symptoms_precaution_df.head()

Unnamed: 0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
0,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
1,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
2,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
3,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths
4,GERD,avoid fatty spicy food,avoid lying down after eating,maintain healthy weight,exercise


## Replace NaNs With Empty String

In [37]:
symptoms_description_df.replace(to_replace=pd.NA, value='', inplace=True)
symptoms_precaution_df.replace(to_replace=pd.NA, value='', inplace=True)

In [38]:
symptoms_description_df.isna().sum()

Drug Reaction                                                                                                                                                                                                 0
An adverse drug reaction (ADR) is an injury caused by taking medication. ADRs may occur following a single dose or prolonged administration of a drug or result from the combination of two or more drugs.    0
dtype: int64

In [39]:
symptoms_precaution_df.isna().sum()

Drug Reaction               0
stop irritation             0
consult nearest hospital    0
stop taking drug            0
follow up                   0
dtype: int64

## Rename column Headers

In [40]:
symptoms_description_df.rename(columns={'Drug Reaction': 'Disease', 'An adverse drug reaction (ADR) is an injury caused by taking medication. ADRs may occur following a single dose or prolonged administration of a drug or result from the combination of two or more drugs.': 'Description'}, inplace=True)
symptoms_precaution_df.rename(columns={'Drug Reaction': 'Disease'}, inplace=True)


In [41]:
symptoms_description_df.columns

Index(['Disease', 'Description'], dtype='object')

In [42]:
symptoms_precaution_df.columns

Index(['Disease', 'stop irritation', 'consult nearest hospital',
       'stop taking drug', 'follow up'],
      dtype='object')

## Combine Data Frames

In [43]:
symptoms_description_df.shape

(40, 2)

In [44]:
symptoms_precaution_df.shape

(40, 5)

In [48]:
df = pd.merge(symptoms_description_df, symptoms_precaution_df, on='Disease', how='inner')

In [49]:
df.shape

(37, 6)

In [50]:
df.head()

Unnamed: 0,Disease,Description,stop irritation,consult nearest hospital,stop taking drug,follow up
0,Malaria,An infectious disease caused by protozoan para...,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
1,Allergy,An allergy is an immune system response to a f...,apply calamine,cover area with bandage,,use ice to compress itching
2,Hypothyroidism,"Hypothyroidism, also called underactive thyroi...",reduce stress,exercise,eat healthy,get proper sleep
3,Psoriasis,Psoriasis is a common skin disorder that forms...,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths
4,GERD,"Gastroesophageal reflux disease, or GERD, is a...",avoid fatty spicy food,avoid lying down after eating,maintain healthy weight,exercise


In [53]:
df.rename(columns={
    'Disease': 'Disease',
    'Description': 'Description',
    'stop irritation': 'Immediate Action',
    'consult nearest hospital': 'Medical Advice',
    'stop taking drug': 'Dietary Guidance',
    'follow up': 'Long-term Management'
}, inplace=True)

df.head()

Unnamed: 0,Disease,Description,Immediate Action,Medical Advice,Dietary Guidance,Long-term Management
0,Malaria,An infectious disease caused by protozoan para...,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
1,Allergy,An allergy is an immune system response to a f...,apply calamine,cover area with bandage,,use ice to compress itching
2,Hypothyroidism,"Hypothyroidism, also called underactive thyroi...",reduce stress,exercise,eat healthy,get proper sleep
3,Psoriasis,Psoriasis is a common skin disorder that forms...,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths
4,GERD,"Gastroesophageal reflux disease, or GERD, is a...",avoid fatty spicy food,avoid lying down after eating,maintain healthy weight,exercise


In [65]:
def print_rows(df):
    """
    Print each row of a DataFrame in the format:
    ColumnName: Content in bold
    
    Args:
    df (pandas.DataFrame): The DataFrame to print
    """
    for index, row in df.iterrows():
        for column in df.columns:
            print(f'{column}: {row[column]}')
        print()  # Empty line between rows

In [66]:
print_rows(df)

Disease: Malaria
Description: An infectious disease caused by protozoan parasites from the Plasmodium family that can be transmitted by the bite of the Anopheles mosquito or by a contaminated needle or transfusion. Falciparum malaria is the most deadly type.
Immediate Action: Consult nearest hospital
Medical Advice: avoid oily food
Dietary Guidance: avoid non veg food
Long-term Management: keep mosquitos out

Disease: Allergy
Description: An allergy is an immune system response to a foreign substance that's not typically harmful to your body.They can include certain foods, pollen, or pet dander. Your immune system's job is to keep you healthy by fighting harmful pathogens.
Immediate Action: apply calamine
Medical Advice: cover area with bandage
Dietary Guidance: 
Long-term Management: use ice to compress itching

Disease: Hypothyroidism
Description: Hypothyroidism, also called underactive thyroid or low thyroid, is a disorder of the endocrine system in which the thyroid gland does not 

## Prepare For Embedding

In [69]:
def prepare_new_column(data):
    """
    Prepares the given disease data for embedding by combining all fields into a single string.
    
    Args:
    data (dict): A dictionary containing disease information
    
    Returns:
    str: A single string containing all the disease information
    """
    fields = [
        f"Disease: {data['Disease']}",
        f"Description: {data['Description']}",
        f"Immediate Action: {data['Immediate Action']}",
        f"Medical Advice: {data['Medical Advice']}",
        f"Dietary Guidance: {data['Dietary Guidance']}",
        f"Long-term Management: {data['Long-term Management']}"
    ]
    
    return " ".join(fields)

In [71]:
df['prepared_text'] = df.apply(lambda row: prepare_new_column(row.to_dict()), axis=1)

In [72]:
df.head()

Unnamed: 0,Disease,Description,Immediate Action,Medical Advice,Dietary Guidance,Long-term Management,prepared_text
0,Malaria,An infectious disease caused by protozoan para...,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out,Disease: Malaria Description: An infectious di...
1,Allergy,An allergy is an immune system response to a f...,apply calamine,cover area with bandage,,use ice to compress itching,Disease: Allergy Description: An allergy is an...
2,Hypothyroidism,"Hypothyroidism, also called underactive thyroi...",reduce stress,exercise,eat healthy,get proper sleep,Disease: Hypothyroidism Description: Hypothyro...
3,Psoriasis,Psoriasis is a common skin disorder that forms...,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths,Disease: Psoriasis Description: Psoriasis is a...
4,GERD,"Gastroesophageal reflux disease, or GERD, is a...",avoid fatty spicy food,avoid lying down after eating,maintain healthy weight,exercise,Disease: GERD Description: Gastroesophageal re...


## Final Data Inspection

In [73]:
df.isna().sum()

Disease                 0
Description             0
Immediate Action        0
Medical Advice          0
Dietary Guidance        0
Long-term Management    0
prepared_text           0
dtype: int64

## Export To CSV

In [74]:
df.to_csv('../data/cleaned_data.csv', index=False)