In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Load every CSV file in the dataset folder into `data_content`, keyed by its
# file name. Column names are lower-cased, and every non-numeric column is
# converted to lower-case text with underscores replaced by spaces.
# Note: `astype(str)` turns missing values into the literal string 'nan'.
dataset_path = 'dataset/'
data_content = {}
for file_name in os.listdir(dataset_path):
    # Ignore anything that is not a CSV (hidden files, checkpoint folders, ...),
    # which would otherwise make `read_csv` fail.
    if not file_name.lower().endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(dataset_path, file_name))
    df.columns = df.columns.str.lower()
    for column in df.columns:
        # Use `not`, not `~`: on a Python bool `~` is bitwise inversion
        # (~True == -2, ~False == -1), so both results are truthy and numeric
        # columns were being converted to strings as well.
        if not np.issubdtype(df[column].dtype, np.number):
            df[column] = df[column].astype(str).str.lower().str.replace('_',' ')
    data_content[file_name] = df

# CLEANING DATA

In [3]:
# Folder that receives the cleaned CSV files. It is created up front so that
# writing into it with `to_csv` does not fail when the folder is missing.
dataset_cleaned_path = 'dataset_cleaned/'
os.makedirs(dataset_cleaned_path, exist_ok=True)

## DISEASE NODE

In [4]:
# Preview the first two rows of the loaded 'symptom_description.csv' frame.
data_content['symptom_description.csv'].head(2)

Unnamed: 0,disease,description
0,drug reaction,an adverse drug reaction (adr) is an injury ca...
1,malaria,an infectious disease caused by protozoan para...


In [5]:
# Disease nodes: the symptom description table with duplicate rows removed,
# written to the cleaned-data folder as 'disease_node.csv'.
description_table = data_content['symptom_description.csv']
df_disease = description_table.drop_duplicates()

disease_node_file = f'{dataset_cleaned_path}disease_node.csv'
df_disease.to_csv(disease_node_file)

print(df_disease.shape)
df_disease.head()

(41, 2)


Unnamed: 0,disease,description
0,drug reaction,an adverse drug reaction (adr) is an injury ca...
1,malaria,an infectious disease caused by protozoan para...
2,allergy,an allergy is an immune system response to a f...
3,hypothyroidism,"hypothyroidism, also called underactive thyroi..."
4,psoriasis,psoriasis is a common skin disorder that forms...


## PRECAUTION NODE

In [6]:
# Preview the first two rows of the loaded 'symptom_precaution.csv' frame.
data_content['symptom_precaution.csv'].head(2)

Unnamed: 0,disease,precaution_1,precaution_2,precaution_3,precaution_4
0,drug reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,malaria,consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out


In [21]:
# Precaution nodes: stack every column whose name contains 'precaution' into
# one long 'precaution' column, discard the 'nan' placeholders, and count how
# often each precaution occurs. The per-column number stored in 'weight' is
# only a placeholder: the `count` aggregation replaces it with the number of
# rows per precaution, and the result is sorted by that count, descending.
precaution_source = data_content['symptom_precaution.csv']
precaution_columns = [name for name in precaution_source if 'precaution' in name]

df_precaution_list = [
    pd.DataFrame({
        'precaution': precaution_source[name],
        'weight': int(name.replace('precaution_', '')),
    })
    for name in precaution_columns
]

stacked_precautions = pd.concat(df_precaution_list, ignore_index=True)
is_placeholder = stacked_precautions['precaution'] == 'nan'
stacked_precautions = stacked_precautions[~is_placeholder]

occurrence_counts = stacked_precautions.groupby(by=['precaution'], as_index=False).agg('count')
df_precaution = occurrence_counts.sort_values(by=['weight'], ascending=False)
df_precaution = df_precaution.reset_index(drop=True)

print(df_precaution.shape)
df_precaution.head()


(96, 2)


Unnamed: 0,precaution,weight
0,consult doctor,15
1,medication,10
2,eat healthy,7
3,follow up,7
4,avoid fatty spicy food,6


## SYMPTOM NODE

In [8]:
# Preview the first two rows of the loaded 'symptom_severity.csv' frame.
data_content['symptom_severity.csv'].head(2)

Unnamed: 0,symptom,weight
0,itching,1
1,skin rash,3


In [9]:
# Symptom nodes: the symptom severity table with duplicate rows removed.
severity_table = data_content['symptom_severity.csv']
df_symptom = severity_table.drop_duplicates()

print(df_symptom.shape)
df_symptom.head()

(133, 2)


Unnamed: 0,symptom,weight
0,itching,1
1,skin rash,3
2,nodal skin eruptions,4
3,continuous sneezing,4
4,shivering,5


## DISEASE AND PRECAUTION CONNECTION

In [14]:
# Preview the first two rows of 'symptom_precaution.csv' again before it is
# reshaped into (disease, precaution) pairs.
data_content['symptom_precaution.csv'].head(2)

Unnamed: 0,disease,precaution_1,precaution_2,precaution_3,precaution_4
0,drug reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,malaria,consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out


In [27]:
# Reshape the precaution table into long (disease, precaution) pairs: one
# two-column frame per non-'disease' column, concatenated, with the 'nan'
# placeholders removed and the rows ordered by disease.
precaution_source = data_content['symptom_precaution.csv']
precaution_columns = [name for name in precaution_source.columns if name != 'disease']

df_precaution_and_symptom_list = [
    precaution_source[['disease', name]].copy().rename(columns={name: 'precaution'})
    for name in precaution_columns
]

all_pairs = pd.concat(df_precaution_and_symptom_list, ignore_index=True)
has_precaution = all_pairs['precaution'] != 'nan'
df_precaution_and_symptom = all_pairs[has_precaution].sort_values(
    by=['disease'], ascending=True, ignore_index=True
)

print(df_precaution_and_symptom.shape)
df_precaution_and_symptom.head()

(162, 2)


Unnamed: 0,disease,precaution
0,(vertigo) paroymsal positional vertigo,avoid abrupt head movment
1,(vertigo) paroymsal positional vertigo,avoid sudden change in body
2,(vertigo) paroymsal positional vertigo,relax
3,(vertigo) paroymsal positional vertigo,lie down
4,acne,avoid too many products


## DISEASE AND SYMPTOM CONNECTION

In [53]:
# Preview the first two rows of the loaded 'disease_symptom.csv' frame.
data_content['disease_symptom.csv'].head(2)

Unnamed: 0,disease,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5,symptom_6,symptom_7,symptom_8,symptom_9,symptom_10,symptom_11,symptom_12,symptom_13,symptom_14,symptom_15,symptom_16,symptom_17
0,fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,


In [2]:
# Display the loaded 'disease_symptom.csv' frame.
# NOTE(review): the saved output of this cell is a NameError for `data_content`,
# which suggests it was run on a fresh kernel before the data-loading cell —
# re-run the notebook top to bottom to confirm.
data_content['disease_symptom.csv']

NameError: name 'data_content' is not defined