In [28]:
import pandas as pd
import numpy as np
import re
import os

In [29]:
dataset_path = 'dataset/'
data_content = {}
for file_name in os.listdir(dataset_path):
    if file_name != 'users':
        df = pd.read_csv(dataset_path+file_name)
        df.columns = df.columns.str.lower()
        for column in df.columns:
            if ~np.issubdtype(df[column].dtype, np.number):
                df[column] = df[column].astype(str).str.lower().str.replace('_',' ')
        data_content[file_name] = df

In [30]:
user_data = {}
users_path = dataset_path+'users/'
df_user_list = []
df = pd.DataFrame()
for file_name in os.listdir(users_path):
    df = pd.read_csv(users_path+file_name)    
    df_user_list.append(df)
    

# CLEANING DATA

In [31]:
dataset_cleaned_path = 'dataset_cleaned/'

## USER NODE

In [32]:
df_user = pd.concat(df_user_list, ignore_index=True)
df_user = df_user.drop_duplicates().reset_index(drop=True)
df_user = df_user.drop_duplicates(subset='name', keep="first")
df_user = df_user.iloc[0:4920, :]
df_user.rename(columns={'name':'user'},inplace=True)
df_user.to_csv(dataset_cleaned_path+'user_node.csv')
print(df_user.shape)
df_user.head(2)

(4920, 5)


Unnamed: 0,user,email,postalZip,region,country
0,Dorian Lowery,dictum@protonmail.net,29848,Picardie,India
1,Nash Miranda,mi.lacinia@icloud.couk,184226,North Jeolla,Italy


## DISEASE NODE

In [33]:
data_content['symptom_description.csv'].head(2)

Unnamed: 0,disease,description
0,drug reaction,an adverse drug reaction (adr) is an injury ca...
1,malaria,an infectious disease caused by protozoan para...


In [34]:
df_disease = data_content['symptom_description.csv'].drop_duplicates()
df_disease.to_csv(dataset_cleaned_path+'disease_node.csv')
print(df_disease.shape)
df_disease.head()

(41, 2)


Unnamed: 0,disease,description
0,drug reaction,an adverse drug reaction (adr) is an injury ca...
1,malaria,an infectious disease caused by protozoan para...
2,allergy,an allergy is an immune system response to a f...
3,hypothyroidism,"hypothyroidism, also called underactive thyroi..."
4,psoriasis,psoriasis is a common skin disorder that forms...


## PRECAUTION NODE

In [35]:
data_content['symptom_precaution.csv'].head(2)

Unnamed: 0,disease,precaution_1,precaution_2,precaution_3,precaution_4
0,drug reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,malaria,consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out


In [36]:
precaution_columns = [x for x in data_content['symptom_precaution.csv'] if 'precaution' in x]
df_precaution_list = []
for column in precaution_columns:
    df = pd.DataFrame()
    df['precaution'] = data_content['symptom_precaution.csv'][column]
    df['weight'] = int(column.replace('precaution_', ''))
    df_precaution_list.append(df)
df_precaution = pd.concat(df_precaution_list, ignore_index=True)
df_precaution.drop(df_precaution[df_precaution['precaution'] == 'nan'].index, inplace=True)
df_precaution = df_precaution.groupby(by=['precaution'], as_index=False)\
                                .agg('count')\
                                .sort_values(by=['weight'], ascending=False)\
                                .reset_index(drop=True)
df_precaution.to_csv(dataset_cleaned_path+'precaution_node.csv')
print(df_precaution.shape)
df_precaution.head()


(96, 2)


Unnamed: 0,precaution,weight
0,consult doctor,15
1,medication,10
2,eat healthy,7
3,follow up,7
4,avoid fatty spicy food,6


# SYMPTOM NODE

In [37]:
data_content['symptom_severity.csv'].head(2)

Unnamed: 0,symptom,weight
0,itching,1
1,skin rash,3


In [38]:
df_symptom = data_content['symptom_severity.csv'].drop_duplicates()
df_symptom.to_csv(dataset_cleaned_path+'symptom_node.csv')
print(df_symptom.shape)
df_symptom.head()

(133, 2)


Unnamed: 0,symptom,weight
0,itching,1
1,skin rash,3
2,nodal skin eruptions,4
3,continuous sneezing,4
4,shivering,5


## PRECAUTION AND SYMPTOM CONNECTION

In [39]:
data_content['symptom_precaution.csv'].head(2)

Unnamed: 0,disease,precaution_1,precaution_2,precaution_3,precaution_4
0,drug reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,malaria,consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out


In [40]:
precaution_columns = [x for x in data_content['symptom_precaution.csv'].columns if x != 'disease']
df_precaution_and_symptom_list = []
for column in precaution_columns:
    df = data_content['symptom_precaution.csv'][['disease', column]].copy()
    df = df.rename(columns={column : 'precaution'})
    df_precaution_and_symptom_list.append(df)

df_precaution_and_symptom = pd.concat(df_precaution_and_symptom_list, ignore_index=True)
df_precaution_and_symptom.drop(df_precaution_and_symptom[df_precaution_and_symptom['precaution'] == 'nan'].index, inplace=True)
df_precaution_and_symptom.sort_values(by=['disease'], ascending=True, inplace=True, ignore_index=True)
df_precaution_and_symptom.to_csv(dataset_cleaned_path+'precaution_and_symptom_node.csv')
print(df_precaution_and_symptom.shape)
df_precaution_and_symptom.head()

(162, 2)


Unnamed: 0,disease,precaution
0,(vertigo) paroymsal positional vertigo,avoid abrupt head movment
1,(vertigo) paroymsal positional vertigo,avoid sudden change in body
2,(vertigo) paroymsal positional vertigo,relax
3,(vertigo) paroymsal positional vertigo,lie down
4,acne,avoid too many products


## DISEASE AND SYMPTOM CONNECTION

In [41]:
data_content['disease_symptom.csv'].head(2)

Unnamed: 0,disease,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5,symptom_6,symptom_7,symptom_8,symptom_9,symptom_10,symptom_11,symptom_12,symptom_13,symptom_14,symptom_15,symptom_16,symptom_17
0,fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,


In [42]:
df_disease_and_symptom = data_content['disease_symptom.csv']
symptom_columns = [x for x in df_disease_and_symptom.columns if x != 'disease']
df_disease_and_symptom_and_user = df_disease_and_symptom.join(df_user[['user']], how='outer')
df_disease_and_symptom_and_user = df_disease_and_symptom_and_user[['disease', 'user'] + symptom_columns].copy()
df_disease_and_symptom_and_user.replace({'nan' : np.nan}, inplace=True)
df_disease_and_symptom_and_user.to_csv(dataset_cleaned_path+'disease_and_symptom_and_user_node.csv')
print(df_disease_and_symptom_and_user.shape)
df_disease_and_symptom_and_user.head()

(4927, 19)


Unnamed: 0,disease,user,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5,symptom_6,symptom_7,symptom_8,symptom_9,symptom_10,symptom_11,symptom_12,symptom_13,symptom_14,symptom_15,symptom_16,symptom_17
0,fungal infection,Dorian Lowery,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,fungal infection,Nash Miranda,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,fungal infection,Sopoline Mcintyre,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,fungal infection,Ray Strong,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,fungal infection,Lamar Love,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


## DISEASE AND USER RELATIONSHIP

In [43]:
df_disease_and_user = df_disease_and_symptom_and_user[['disease', 'user']].copy()
df_disease_and_user.dropna(inplace=True)
df_disease_and_user = df_disease_and_user.drop_duplicates().reset_index(drop=True)
df_disease_and_user.to_csv(dataset_cleaned_path+'disease_and_user_node.csv')
print(df_disease_and_user.shape)
df_disease_and_user.head()

(4913, 2)


Unnamed: 0,disease,user
0,fungal infection,Dorian Lowery
1,fungal infection,Nash Miranda
2,fungal infection,Sopoline Mcintyre
3,fungal infection,Ray Strong
4,fungal infection,Lamar Love


## SYMPTOM AND USER RELATIONSHIP

In [44]:
df_user_and_symptom = df_disease_and_symptom_and_user[['user'] + symptom_columns]
df_user_and_symptom_list = []
for column in symptom_columns:
    df = df_user_and_symptom[['user', column]].copy()
    df.rename(columns={column : 'symptom'}, inplace=True)
    df_user_and_symptom_list.append(df)

df_user_and_symptom = pd.concat(df_user_and_symptom_list)
df_user_and_symptom.sort_values(by=['user'], inplace=True)
df_user_and_symptom.dropna(inplace=True)
df_user_and_symptom.reset_index(drop=True, inplace=True)
df_user_and_symptom['symptom'] = df_user_and_symptom['symptom'].str.strip()
df_user_and_symptom.to_csv(dataset_cleaned_path+'user_and_symptom_node.csv')
print(df_user_and_symptom.shape)
df_user_and_symptom.head(2)

(36594, 2)


Unnamed: 0,user,symptom
0,Aaron Munoz,sweating
1,Aaron Munoz,weight loss


## SYMPTOM AND DISEASE RELATIONSHIP

In [46]:
df_disease_and_symptom = df_disease_and_symptom_and_user[['disease'] + symptom_columns]
df_disease_and_symptom_list = []
for column in symptom_columns:
    df = df_disease_and_symptom[['disease', column]].copy()
    df.rename(columns={column : 'symptom'}, inplace=True)
    df_disease_and_symptom_list.append(df)

df_disease_and_symptom = pd.concat(df_disease_and_symptom_list)
df_disease_and_symptom.sort_values(by=['disease'], inplace=True)
df_disease_and_symptom.dropna(inplace=True)
df_disease_and_symptom.drop_duplicates(inplace=True)
df_disease_and_symptom.reset_index(drop=True, inplace=True)
df_disease_and_symptom['symptom'] = df_disease_and_symptom['symptom'].str.strip()
df_disease_and_symptom.to_csv(dataset_cleaned_path+'disease_and_symptom_node.csv')
print(df_disease_and_symptom.shape)
df_disease_and_symptom.head(2)

(321, 2)


Unnamed: 0,disease,symptom
0,(vertigo) paroymsal positional vertigo,nausea
1,(vertigo) paroymsal positional vertigo,spinning movements
