## Data Cleaning and Transformation with Diseases & Symptom Dataset

In [1]:
import pandas as pd
import numpy as np

#### loading data 

In [2]:
data = pd.read_csv('datasets/dataset.csv')
severity = pd.read_csv('datasets/Symptom-severity.csv')
descriptions = pd.read_csv('datasets/symptom_Description.csv')
precautions = pd.read_csv('datasets/symptom_precaution.csv')

#### shape and columns of each dataset

In [3]:
print(data.columns)
print(data.shape)

Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')
(4920, 18)


In [4]:
print(severity.columns)
print(severity.shape)

Index(['Symptom', 'weight'], dtype='object')
(133, 2)


In [5]:
print(descriptions.columns)
print(descriptions.shape)

Index(['Disease', 'Description'], dtype='object')
(41, 2)


In [6]:
print(precautions.columns)
print(precautions.shape)

Index(['Disease', 'Precaution_1', 'Precaution_2', 'Precaution_3',
       'Precaution_4'],
      dtype='object')
(41, 5)


#### viewing data

In [7]:
data.head(3)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,


In [8]:
severity.head(3)

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4


In [9]:
descriptions.head(3)

Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...


In [10]:
precautions.head(3)

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching


#### summary statistics

In [11]:
data.describe()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,4920,4920,4920,4920,4572,3714,2934,2268,1944,1692,1512,1194,744,504,306,240,192,72
unique,41,34,48,54,50,38,32,26,21,22,21,18,11,8,4,3,3,1
top,Fungal infection,vomiting,vomiting,fatigue,high_fever,headache,nausea,abdominal_pain,abdominal_pain,yellowing_of_eyes,yellowing_of_eyes,irritability,malaise,muscle_pain,chest_pain,chest_pain,blood_in_sputum,muscle_pain
freq,120,822,870,726,378,348,390,264,276,228,198,120,126,72,96,144,72,72


In [12]:
severity.describe()

Unnamed: 0,weight
count,133.0
mean,4.225564
std,1.323543
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,7.0


descriptions.describe()

In [13]:
precautions.describe()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
count,41,41,41,40,40
unique,41,32,34,30,24
top,Drug Reaction,Consult nearest hospital,exercise,consult doctor,follow up
freq,1,3,3,6,6


### Data Transformation
* combining all datasets into one

#### repalcing null values with 0

In [14]:
data.fillna(0, inplace=True)
data.head(2)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### adding all symptoms to a single list

In [15]:
ordered_symptoms = []

for i in range(len(data['Disease'])):
    temp_list = []
    for k in range(1,17):
        if data.iloc[i][k] == 0:
            break
        temp_list.append(data.iloc[i][k])
    ordered_symptoms.append(temp_list)

  if data.iloc[i][k] == 0:
  temp_list.append(data.iloc[i][k])


#### capitalize Diseases

In [16]:
data['Disease']=data['Disease'].str.capitalize()

#### Using sorting methods to sort all columns by disease

In [18]:
# lit of all diseases alphabetically

diseases = sorted(data['Disease'].unique())

In [19]:
# sorting columns of dataset
descs = descriptions.sort_values(by='Disease')
pre_c = precautions.sort_values(by='Disease')

#### adding all precautions to one list

In [20]:
ordered_cautions = []
for i in range(len(pre_c['Disease'])):
    temp_list = []
    for k in range(1,5):
        temp_list.append(pre_c.iloc[i][k])
    ordered_cautions.append(temp_list)


  temp_list.append(pre_c.iloc[i][k])


#### creating a dictionary with diesease and symptoms

In [21]:
#Dictionary to hold disease (keys) and its values (a list of the symptoms)
disease_dict = {}

for i in range(len(data['Disease'])):
    symptoms_list = []
    for k in range(len(data.columns)):
        if data.iloc[i][k] == 0 or data.iloc[i][k] in disease_dict.keys():
            continue
        symptoms_list.append(data.iloc[i][k])
    disease_dict[data['Disease'][i]] = symptoms_list


  if data.iloc[i][k] == 0 or data.iloc[i][k] in disease_dict.keys():
  symptoms_list.append(data.iloc[i][k])


#### sorting disease and symptom

In [22]:
# sorting diseases
sorted_keys = sorted(disease_dict.keys())

In [23]:
# sorting symptoms in accordance with key(disease)
symptoms_list = []
for i in range(len(sorted_keys)):
    symptoms_list.append(disease_dict[sorted_keys[i]])

#### Assembling everything into a single dataframe

In [24]:
df = pd.DataFrame({"Diseases":diseases,"Descriptions":descs['Description'],
"Precautions":ordered_cautions, "Symptoms":symptoms_list})

#### setting index values

In [25]:
index=np.arange(1,len(df)+1)
df.set_index(index, inplace=True)

### End Result

In [26]:
df

Unnamed: 0,Diseases,Descriptions,Precautions,Symptoms
1,(vertigo) paroymsal positional vertigo,Benign paroxysmal positional vertigo (BPPV) is...,"[lie down, avoid sudden change in body, avoid ...","[ vomiting, headache, nausea, spinning_move..."
2,Acne,Acquired immunodeficiency syndrome (AIDS) is a...,"[avoid open cuts, wear ppe if possible, consul...","[ skin_rash, pus_filled_pimples, blackheads,..."
3,Aids,"Acne vulgaris is the formation of comedones, p...","[bath twice, avoid fatty spicy food, drink ple...","[ muscle_wasting, patches_in_throat, high_fe..."
4,Alcoholic hepatitis,"Alcoholic hepatitis is a diseased, inflammator...","[stop alcohol consumption, consult doctor, med...","[ vomiting, yellowish_skin, abdominal_pain, ..."
5,Allergy,An allergy is an immune system response to a f...,"[apply calamine, cover area with bandage, nan,...","[ continuous_sneezing, shivering, chills, w..."
6,Arthritis,Arthritis is the swelling and tenderness of on...,"[exercise, use hot and cold therapy, try acupu...","[ muscle_weakness, stiff_neck, swelling_join..."
7,Bronchial asthma,Bronchial asthma is a medical condition which ...,"[switch to loose cloothing, take deep breaths,...","[ fatigue, cough, high_fever, breathlessnes..."
8,Cervical spondylosis,Cervical spondylosis is a general term for age...,"[use heating pad or cold pack, exercise, take ...","[ back_pain, weakness_in_limbs, neck_pain, ..."
9,Chicken pox,Chickenpox is a highly contagious disease caus...,"[use neem in bathing , consume neem leaves, ta...","[itching, skin_rash, fatigue, lethargy, hi..."
10,Chronic cholestasis,"Chronic cholestatic diseases, whether occurrin...","[cold baths, anti itch medicine, consult docto...","[itching, vomiting, yellowish_skin, nausea,..."
