# ðŸ—“ Task 1 â€” Data Understanding & Loading

Objective: Load both datasets, view shape, columns, and missing values.

In [26]:
# import Necessary library
import pandas as pd
import numpy as np
from langchain.text_splitter import CharacterTextSplitter
import os

# Explore Symptoms dataset

In [11]:
#Load dataset 
symptoms_df = pd.read_csv(r"C:\Users\PMYLS\Pictures\Documents\AI-Health-Assistant\data\raw\DiseaseAndSymptoms.csv")
precautions_df = pd.read_csv(r"C:\Users\PMYLS\Pictures\Documents\AI-Health-Assistant\data\raw\Disease precaution.csv")

# Overview
print(symptoms_df.shape)
print(symptoms_df.columns)
#print(symptoms_df.head())
print(symptoms_df.info())

#Check missing values
print(symptoms_df.isnull().sum())

(4920, 18)
Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null  

# Explore precautions

In [12]:
# Overview
print(precautions_df.shape)
print(precautions_df.columns)
#print(precautions_df.head())
print(precautions_df.info())
#Check missing values
print(precautions_df.isnull().sum())

(41, 5)
Index(['Disease', 'Precaution_1', 'Precaution_2', 'Precaution_3',
       'Precaution_4'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Disease       41 non-null     object
 1   Precaution_1  41 non-null     object
 2   Precaution_2  41 non-null     object
 3   Precaution_3  40 non-null     object
 4   Precaution_4  40 non-null     object
dtypes: object(5)
memory usage: 1.7+ KB
None
Disease         0
Precaution_1    0
Precaution_2    0
Precaution_3    1
Precaution_4    1
dtype: int64


# ðŸ—“ Task 2 â€” Data Cleaning

Objective: Handle NaN values, remove duplicates, and format text cleanly.

In [13]:
# Normalize Disease column FIRST 
def clean_string_cell(x):
    # keep NaN as np.nan (do not convert to "nan" string)
    if pd.isna(x):
        return np.nan
    if isinstance(x, str):
        tx = x.strip().lower()
        return tx if tx != "" else np.nan
    # other types -> convert to str then clean
    return str(x).strip().lower()

# Apply to Disease columns only (to preserve structure)
symptoms_df['Disease'] = symptoms_df['Disease'].apply(clean_string_cell)
precautions_df['Disease'] = precautions_df['Disease'].apply(clean_string_cell)

# ------------- 2) Clean all string cells in each DF (but keep NaN as NaN) -------------
def clean_dataframe_strings(df):
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].apply(lambda x: clean_string_cell(x))
    return df

symptoms_df = clean_dataframe_strings(symptoms_df)
precautions_df = clean_dataframe_strings(precautions_df)

#  Merge on cleaned 'disease'
merged = pd.merge(symptoms_df, precautions_df, on='Disease', how='left', suffixes=('_sym', '_prec'))

#  Create clean lists for symptoms & precautions
# symptom columns (detect columns that contain 'symptom' ignoring case)
symptom_cols = [c for c in merged.columns if 'symptom' in c.lower()]
precaution_cols = [c for c in merged.columns if 'precaution' in c.lower()]

def to_clean_list(row_vals):
    cleaned = [] 
    for v in row_vals:
        if pd.isna(v):
            continue
        # remove internal extra spaces, underscores => replace with space if you want, or keep underscore
        s = str(v).strip().lower()
        if s in ("", "nan", "none"):
            continue
        cleaned.append(s)
    # remove duplicates while preserving order
    seen = set()
    uniq = []
    for item in cleaned:
        if item not in seen:
            uniq.append(item)
            seen.add(item)
    return uniq


merged['all_symptoms'] = merged[symptom_cols].apply(lambda r: to_clean_list(r.values), axis=1)
merged['precautions_list'] = merged[precaution_cols].apply(lambda r: to_clean_list(r.values), axis=1)

#Normalize list order (sort) to help dedupe (optional) -------------
merged['all_symptoms_sorted'] = merged['all_symptoms'].apply(lambda lst: sorted(lst))
merged['precautions_sorted'] = merged['precautions_list'].apply(lambda lst: sorted(lst))

#Drop rows with no disease name -------------
merged = merged[~merged['Disease'].isna()].reset_index(drop=True)

# Convert lists in these columns to tuples so they can be hashed for drop_duplicates
merged['all_symptoms_sorted'] = merged['all_symptoms_sorted'].apply(tuple)
merged['precautions_sorted'] = merged['precautions_sorted'].apply(tuple)

# Now this line will work without the TypeError:
merged = merged.drop_duplicates(subset=['Disease', 'all_symptoms_sorted', 'precautions_sorted']).reset_index(drop=True)

#  Final keep columns and nicer formatting 
final_df = merged[['Disease', 'all_symptoms', 'precautions_list']].rename(
    columns={'precautions_list': 'precautions'}
)

# Optional: Title case disease for display (not for matching)
final_df['Disease_display'] = final_df['Disease'].apply(lambda x: x.title())

# Save cleaned files 
final_df.to_csv(r"C:\Users\PMYLS\Pictures\Documents\AI-Health-Assistant\data\cleaned/cleaned_health_data.csv", index=False)
final_df.to_json(r"C:\Users\PMYLS\Pictures\Documents\AI-Health-Assistant\data\cleaned/cleaned_health_data.json", orient='records', force_ascii=False)

#  Quick inspection prints 
print("Number of unique diseases:", final_df['Disease'].nunique())
print("Sample rows:")
display(final_df.head(10))


Number of unique diseases: 41
Sample rows:


Unnamed: 0,Disease,all_symptoms,precautions,Disease_display
0,fungal infection,"[itching, skin_rash, nodal_skin_eruptions, dis...","[bath twice, use detol or neem in bathing wate...",Fungal Infection
1,fungal infection,"[skin_rash, nodal_skin_eruptions, dischromic _...","[bath twice, use detol or neem in bathing wate...",Fungal Infection
2,fungal infection,"[itching, nodal_skin_eruptions, dischromic _pa...","[bath twice, use detol or neem in bathing wate...",Fungal Infection
3,fungal infection,"[itching, skin_rash, dischromic _patches]","[bath twice, use detol or neem in bathing wate...",Fungal Infection
4,fungal infection,"[itching, skin_rash, nodal_skin_eruptions]","[bath twice, use detol or neem in bathing wate...",Fungal Infection
5,allergy,"[continuous_sneezing, shivering, chills, water...","[apply calamine, cover area with bandage, use ...",Allergy
6,allergy,"[shivering, chills, watering_from_eyes]","[apply calamine, cover area with bandage, use ...",Allergy
7,allergy,"[continuous_sneezing, chills, watering_from_eyes]","[apply calamine, cover area with bandage, use ...",Allergy
8,allergy,"[continuous_sneezing, shivering, watering_from...","[apply calamine, cover area with bandage, use ...",Allergy
9,allergy,"[continuous_sneezing, shivering, chills]","[apply calamine, cover area with bandage, use ...",Allergy


In [14]:
def format_knowledge(row):
    
    # Correction applied here:
    # We use a generator expression to convert every item to a string before joining.
    symptoms = ", ".join(str(symptom) for symptom in row["all_symptoms"])
    
    # Correction applied here:
    precautions = ", ".join(str(p) for p in row["precautions"])
    
    return f"Disease: {row['Disease']}\nSymptoms: {symptoms}\nPrecautions: {precautions}\n"

# The rest of your code remains the same
knowledge_texts = final_df.apply(format_knowledge, axis=1)
with open(r"C:\Users\PMYLS\Pictures\Documents\AI-Health-Assistant\data/medical_knowledge_base.txt", "w", encoding="utf-8") as f:
  f.write("\n".join(knowledge_texts))
print(type(knowledge_texts))
knowledge_texts.head()



<class 'pandas.core.series.Series'>


0    Disease: fungal infection\nSymptoms: itching, ...
1    Disease: fungal infection\nSymptoms: skin_rash...
2    Disease: fungal infection\nSymptoms: itching, ...
3    Disease: fungal infection\nSymptoms: itching, ...
4    Disease: fungal infection\nSymptoms: itching, ...
dtype: object

# Milestone 4: Text Chunking for RAG

**Objective:** Convert text data into manageable chunks for embeddings and retrieval.

In [None]:

# Define path
DATA_PATH = r"C:\Users\PMYLS\Pictures\Documents\AI-Health-Assistant\data\cleaned\cleaned_health_data.csv"
CHUNK_OUTPUT_PATH = r"C:\Users\PMYLS\Pictures\Documents\AI-Health-Assistant\data\chunks\chunks.csv"


# Step 1: Load cleaned dataset
df = pd.read_csv(DATA_PATH)

# Step 2: Create readable text format for each disease
def create_text_entry(row):
    symptoms = ", ".join(eval(row["all_symptoms"])) if isinstance(row["all_symptoms"], str) else ""
    precautions = ", ".join(eval(row["precautions"])) if isinstance(row["precautions"], str) else ""
    text = (
        f"Disease: {row['Disease']}\n"
        f"Symptoms: {symptoms}\n"
        f"Precautions: {precautions}\n"
    )
    return text.strip()

df["combined_text"] = df.apply(create_text_entry, axis=1)

# Step 3: Initialize LangChain text splitter
splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=800,     # 500â€“1000 tokens range
    chunk_overlap=50,
    length_function=len
)

# Step 4: Split each entry into chunks
chunks = []
for i, row in df.iterrows():
    docs = splitter.split_text(row["combined_text"])
    for j, chunk in enumerate(docs):
        chunks.append({
            "disease": row["Disease"],
            "chunk_id": f"{row['Disease']}_{j}",
            "text_chunk": chunk
        })

# Step 5: Store chunks in DataFrame
chunks_df = pd.DataFrame(chunks)

# Step 6: Save for embedding
os.makedirs("../data", exist_ok=True)
chunks_df.to_csv(CHUNK_OUTPUT_PATH, index=False)

print(f"âœ… Chunk creation complete. Saved {len(chunks_df)} chunks to {CHUNK_OUTPUT_PATH}")
print(chunks_df)


âœ… Chunk creation complete. Saved 304 chunks to C:\Users\PMYLS\Pictures\Documents\AI-Health-Assistant\data\chunks\chunks.csv
              disease            chunk_id  \
0    fungal infection  fungal infection_0   
1    fungal infection  fungal infection_0   
2    fungal infection  fungal infection_0   
3    fungal infection  fungal infection_0   
4    fungal infection  fungal infection_0   
..                ...                 ...   
299          impetigo          impetigo_0   
300          impetigo          impetigo_0   
301          impetigo          impetigo_0   
302          impetigo          impetigo_0   
303          impetigo          impetigo_0   

                                            text_chunk  
0    Disease: fungal infection\nSymptoms: itching, ...  
1    Disease: fungal infection\nSymptoms: skin_rash...  
2    Disease: fungal infection\nSymptoms: itching, ...  
3    Disease: fungal infection\nSymptoms: itching, ...  
4    Disease: fungal infection\nSymptoms: itchin