Importing Libraries

In [706]:
import pandas as pd

In [707]:
# Step 1: Load the datasets
disease_symptoms = pd.read_csv("../data/disease_symptoms.csv")
symptom_description = pd.read_csv("../data/symptom_description.csv")
symptom_precaution = pd.read_csv("../data/symptom_precaution.csv")
symptom_severity = pd.read_csv("../data/symptom_severity.csv")

In [708]:
# Step 2: Normalize column names
disease_symptoms.columns = disease_symptoms.columns.str.strip().str.lower()
symptom_description.columns = symptom_description.columns.str.strip().str.lower()
symptom_precaution.columns = symptom_precaution.columns.str.strip().str.lower()
symptom_severity.columns = symptom_severity.columns.str.strip().str.lower()

In [709]:
# Step 3: Convert values to lowercase for below columns
disease_symptoms = disease_symptoms.map(lambda x: x.strip().lower() if isinstance(x, str) else x)
symptom_description['disease'] = symptom_description['disease'].str.strip().str.lower()
symptom_precaution = symptom_precaution.map(lambda x: x.strip().lower() if isinstance(x, str) else x)
symptom_severity.columns = symptom_severity.columns.str.strip().str.lower()

In [710]:
disease_symptoms.isnull().sum()

disease          0
symptom_1        0
symptom_2        0
symptom_3        0
symptom_4      348
symptom_5     1206
symptom_6     1986
symptom_7     2652
symptom_8     2976
symptom_9     3228
symptom_10    3408
symptom_11    3726
symptom_12    4176
symptom_13    4416
symptom_14    4614
symptom_15    4680
symptom_16    4728
symptom_17    4848
dtype: int64

In [711]:
# Verify the data types now
print(disease_symptoms.dtypes)

disease       object
symptom_1     object
symptom_2     object
symptom_3     object
symptom_4     object
symptom_5     object
symptom_6     object
symptom_7     object
symptom_8     object
symptom_9     object
symptom_10    object
symptom_11    object
symptom_12    object
symptom_13    object
symptom_14    object
symptom_15    object
symptom_16    object
symptom_17    object
dtype: object


In [712]:
symptom_cols = [col for col in disease_symptoms.columns if col.startswith("symptom_")]
symptom_cols

['symptom_1',
 'symptom_2',
 'symptom_3',
 'symptom_4',
 'symptom_5',
 'symptom_6',
 'symptom_7',
 'symptom_8',
 'symptom_9',
 'symptom_10',
 'symptom_11',
 'symptom_12',
 'symptom_13',
 'symptom_14',
 'symptom_15',
 'symptom_16',
 'symptom_17']

In [713]:
disease_symptoms.shape

(4920, 18)

In [714]:
disease_symptoms.duplicated().sum()

np.int64(4616)

In [715]:
disease_symptoms = disease_symptoms.drop_duplicates()

In [716]:
disease_symptoms.shape

(304, 18)

In [717]:
# Melt the DataFrame to gather all symptom columns into one column
melted_df = disease_symptoms.melt(id_vars=['disease'], value_vars=symptom_cols, var_name='symptom_column', value_name='symptom')

# Drop NaN values to avoid including them in the list
melted_df = melted_df.dropna(subset=['symptom'])

# Group by disease and collect unique symptoms into a list
disease_symptom_list = melted_df.groupby('disease')['symptom'].agg(lambda x: list(x.unique())).reset_index()

# Display the result
print(disease_symptom_list)


                                    disease  \
0   (vertigo) paroymsal  positional vertigo   
1                                      acne   
2                                      aids   
3                       alcoholic hepatitis   
4                                   allergy   
5                                 arthritis   
6                          bronchial asthma   
7                      cervical spondylosis   
8                               chicken pox   
9                       chronic cholestasis   
10                              common cold   
11                                   dengue   
12                                 diabetes   
13             dimorphic hemmorhoids(piles)   
14                            drug reaction   
15                         fungal infection   
16                          gastroenteritis   
17                                     gerd   
18                             heart attack   
19                              hepatitis a   
20           

In [718]:
# Step 1: Explode the symptoms list into individual rows
disease_symptom_list = disease_symptom_list.explode('symptom').reset_index(drop=True)

# Step 2: Remove duplicates to ensure unique disease-symptom pairs
disease_symptom_list = disease_symptom_list.drop_duplicates()

# Display the final DataFrame
print(disease_symptom_list)


                                     disease                  symptom
0    (vertigo) paroymsal  positional vertigo                 vomiting
1    (vertigo) paroymsal  positional vertigo                 headache
2    (vertigo) paroymsal  positional vertigo                   nausea
3    (vertigo) paroymsal  positional vertigo       spinning_movements
4    (vertigo) paroymsal  positional vertigo          loss_of_balance
..                                       ...                      ...
316                           varicose veins                 bruising
317                           varicose veins                  obesity
318                           varicose veins             swollen_legs
319                           varicose veins    swollen_blood_vessels
320                           varicose veins  prominent_veins_on_calf

[321 rows x 2 columns]


In [719]:
disease_symptom_list.to_csv("../data/unique_disease_symptoms_1.csv", index=False)


In [720]:
disease_symptom_list.head(15)

Unnamed: 0,disease,symptom
0,(vertigo) paroymsal positional vertigo,vomiting
1,(vertigo) paroymsal positional vertigo,headache
2,(vertigo) paroymsal positional vertigo,nausea
3,(vertigo) paroymsal positional vertigo,spinning_movements
4,(vertigo) paroymsal positional vertigo,loss_of_balance
5,(vertigo) paroymsal positional vertigo,unsteadiness
6,acne,skin_rash
7,acne,pus_filled_pimples
8,acne,blackheads
9,acne,scurring


In [721]:
disease_symptom_list.shape

(321, 2)

In [722]:
# Checking null values again for 'disease_symptoms' dataset
disease_symptom_list.isnull().sum()

disease    0
symptom    0
dtype: int64

In [723]:
symptom_description.isnull().sum()

disease        0
description    0
dtype: int64

In [724]:
symptom_precaution.isnull().sum()

disease         0
precaution_1    0
precaution_2    0
precaution_3    1
precaution_4    1
dtype: int64

In [725]:
precaution_cols = [col for col in symptom_precaution.columns if col.startswith("precaution_")]
precaution_cols

['precaution_1', 'precaution_2', 'precaution_3', 'precaution_4']

In [726]:
# Melt the DataFrame to gather all precaution columns into one column
melted_precaution_df = symptom_precaution.melt(id_vars=['disease'], value_vars=precaution_cols, var_name='precaution_column', value_name='precautions')

# Drop NaN values to avoid including them in the list
melted_precaution_df = melted_precaution_df.dropna(subset=['precautions'])

# Group by disease and collect unique precautions into a list
disease_precaution_list = melted_precaution_df.groupby('disease')['precautions'].agg(lambda x: list(x.unique())).reset_index()

# Display the result
print(disease_precaution_list)


                                    disease  \
0   (vertigo) paroymsal  positional vertigo   
1                                      acne   
2                                      aids   
3                       alcoholic hepatitis   
4                                   allergy   
5                                 arthritis   
6                          bronchial asthma   
7                      cervical spondylosis   
8                               chicken pox   
9                       chronic cholestasis   
10                              common cold   
11                                   dengue   
12                                 diabetes   
13             dimorphic hemmorhoids(piles)   
14                            drug reaction   
15                         fungal infection   
16                          gastroenteritis   
17                                     gerd   
18                             heart attack   
19                              hepatitis a   
20           

In [727]:
disease_precaution_list.shape

(41, 2)

In [728]:
symptom_severity.isnull().sum()

symptom    0
weight     0
dtype: int64

In [729]:
symptom_severity.duplicated().sum()

np.int64(0)

In [730]:
symptom_severity.head()

Unnamed: 0,symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [731]:
# Check if there are duplicates in symptom_description
symptom_description.duplicated().sum()

np.int64(0)

In [732]:
# Check if there are duplicates in symptom_precaution
symptom_precaution.duplicated().sum()

np.int64(0)

In [733]:
# Step 6: Combine with description and precaution datasets
preprocessed_symptoms_df = disease_symptom_list.merge(
    symptom_description, on="disease", how="left"
)

In [734]:
print(preprocessed_symptoms_df.head())

                                   disease             symptom  \
0  (vertigo) paroymsal  positional vertigo            vomiting   
1  (vertigo) paroymsal  positional vertigo            headache   
2  (vertigo) paroymsal  positional vertigo              nausea   
3  (vertigo) paroymsal  positional vertigo  spinning_movements   
4  (vertigo) paroymsal  positional vertigo     loss_of_balance   

                                         description  
0  Benign paroxysmal positional vertigo (BPPV) is...  
1  Benign paroxysmal positional vertigo (BPPV) is...  
2  Benign paroxysmal positional vertigo (BPPV) is...  
3  Benign paroxysmal positional vertigo (BPPV) is...  
4  Benign paroxysmal positional vertigo (BPPV) is...  


In [735]:
# Step 5: Add symptom weights
preprocessed_symptoms_df = preprocessed_symptoms_df.merge(
    symptom_severity, on="symptom", how="left"
)

In [736]:
# Propagate the correct weight for each symptom (in case of NaN)
# If weight is NaN, fill it with the corresponding non-NaN value for that symptom
#preprocessed_symptoms_df['weight'] = preprocessed_symptoms_df.groupby('symptom')['weight'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))

In [737]:
preprocessed_symptoms_df.head(10)

Unnamed: 0,disease,symptom,description,weight
0,(vertigo) paroymsal positional vertigo,vomiting,Benign paroxysmal positional vertigo (BPPV) is...,5.0
1,(vertigo) paroymsal positional vertigo,headache,Benign paroxysmal positional vertigo (BPPV) is...,3.0
2,(vertigo) paroymsal positional vertigo,nausea,Benign paroxysmal positional vertigo (BPPV) is...,5.0
3,(vertigo) paroymsal positional vertigo,spinning_movements,Benign paroxysmal positional vertigo (BPPV) is...,6.0
4,(vertigo) paroymsal positional vertigo,loss_of_balance,Benign paroxysmal positional vertigo (BPPV) is...,4.0
5,(vertigo) paroymsal positional vertigo,unsteadiness,Benign paroxysmal positional vertigo (BPPV) is...,4.0
6,acne,skin_rash,"Acne vulgaris is the formation of comedones, p...",3.0
7,acne,pus_filled_pimples,"Acne vulgaris is the formation of comedones, p...",2.0
8,acne,blackheads,"Acne vulgaris is the formation of comedones, p...",2.0
9,acne,scurring,"Acne vulgaris is the formation of comedones, p...",2.0


In [738]:
# Calculate the mean weight
mean_weight = round(symptom_severity['weight'].mean(), 2)

#Ensure no NaN values remain for 'weight' column (optional, in case no valid fill method was possible)
preprocessed_symptoms_df['weight'].fillna(mean_weight, inplace=True)  # Filling remaining NaNs with the mean weight

#Ensure no NaN values remain for 'description' column (optional, in case no valid fill method was possible)
preprocessed_symptoms_df['description'].fillna("No Description Available", inplace=True)  # Filling remaining NaNs with the mean weight


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  preprocessed_symptoms_df['weight'].fillna(mean_weight, inplace=True)  # Filling remaining NaNs with the mean weight
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  preprocessed_symptoms_df['description'].fillna("No Description Available", inplace=True)  # Filling remaining NaNs w

In [739]:
preprocessed_symptoms_df.head(10)

Unnamed: 0,disease,symptom,description,weight
0,(vertigo) paroymsal positional vertigo,vomiting,Benign paroxysmal positional vertigo (BPPV) is...,5.0
1,(vertigo) paroymsal positional vertigo,headache,Benign paroxysmal positional vertigo (BPPV) is...,3.0
2,(vertigo) paroymsal positional vertigo,nausea,Benign paroxysmal positional vertigo (BPPV) is...,5.0
3,(vertigo) paroymsal positional vertigo,spinning_movements,Benign paroxysmal positional vertigo (BPPV) is...,6.0
4,(vertigo) paroymsal positional vertigo,loss_of_balance,Benign paroxysmal positional vertigo (BPPV) is...,4.0
5,(vertigo) paroymsal positional vertigo,unsteadiness,Benign paroxysmal positional vertigo (BPPV) is...,4.0
6,acne,skin_rash,"Acne vulgaris is the formation of comedones, p...",3.0
7,acne,pus_filled_pimples,"Acne vulgaris is the formation of comedones, p...",2.0
8,acne,blackheads,"Acne vulgaris is the formation of comedones, p...",2.0
9,acne,scurring,"Acne vulgaris is the formation of comedones, p...",2.0


In [740]:
# Step 6: Add symptom precautions
preprocessed_symptoms_df = preprocessed_symptoms_df.merge(
    disease_precaution_list, on="disease", how="left"
)

In [741]:
# Step 7: Save the cleaned and combined dataset
preprocessed_symptoms_df.to_csv("../data/preprocessed_symptom_dataset.csv", index=False)

print("Combined dataset saved as 'preprocessed_symptom_dataset.csv'")

Combined dataset saved as 'preprocessed_symptom_dataset.csv'
