In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# Data Exploration

In [3]:
# load dataset
df_dirty = pd.read_csv('data/symptom_data.csv', sep=',')
df_dirty.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Tick fever,Fever,Nasal Discharge,Lameness,Lethargy,Increased drinking and urination,Neurological Disorders,,,,,,,,,,,
1,Tick fever,Fever,Lameness,Swollen Lymph nodes,Vomiting,Neurological Disorders,,,,,,,,,,,,
2,Tick fever,Fever,Nasal Discharge,Lethargy,Swollen Lymph nodes,,,,,,,,,,,,,
3,Tick fever,Fever,Nasal Discharge,Lameness,Vomiting,Neurological Disorders,,,,,,,,,,,,
4,Tick fever,Nasal Discharge,Weight Loss,Breathing Difficulty,Heart Complication,Vomiting,,,,,,,,,,,,


In [4]:
# dataset info
df_dirty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23999 entries, 0 to 23998
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Disease     23999 non-null  object 
 1   Symptom_1   23999 non-null  object 
 2   Symptom_2   23999 non-null  object 
 3   Symptom_3   23999 non-null  object 
 4   Symptom_4   23863 non-null  object 
 5   Symptom_5   18856 non-null  object 
 6   Symptom_6   10658 non-null  object 
 7   Symptom_7   506 non-null    object 
 8   Symptom_8   0 non-null      float64
 9   Symptom_9   0 non-null      float64
 10  Symptom_10  0 non-null      float64
 11  Symptom_11  0 non-null      float64
 12  Symptom_12  0 non-null      float64
 13  Symptom_13  0 non-null      float64
 14  Symptom_14  0 non-null      float64
 15  Symptom_15  0 non-null      float64
 16  Symptom_16  0 non-null      float64
 17  Symptom_17  0 non-null      float64
dtypes: float64(10), object(8)
memory usage: 3.3+ MB


In [5]:
# what are types of diseases available there?
dis = df_dirty['Disease'].unique()
dis

array(['Tick fever', 'Distemper', 'Parvovirus', 'Hepatitis ', 'Tetanus ',
       'Chronic kidney Disease ', 'Diabetes', 'Gastrointestinal Disease',
       'Allergies', 'Gingitivis', 'Cancers', 'Skin Rashes'], dtype=object)

In [6]:
# how many disctinct diseases are there?
len(dis)

12

In [7]:
# what are types of distinct symptoms available there?
sym = pd.unique(df_dirty[['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5', 'Symptom_6', 'Symptom_7']].values.ravel())
sym

array(['Fever', 'Nasal Discharge', 'Lameness', 'Lethargy',
       'Increased drinking and urination', 'Neurological Disorders', nan,
       'Swollen Lymph nodes', 'Vomiting', 'Weight Loss',
       'Breathing Difficulty', 'Heart Complication', 'Loss of appetite',
       'Depression', 'Eating less than usual', 'Seizures', 'Paralysis',
       'Coughing', 'Diarrhea', 'Excessive Salivation', 'Eye Discharge',
       'Weakness', 'Discomfort', 'Redness around Eye area', 'Sepsis',
       'Anorexia', 'Severe Dehydration', 'Pain', 'Tender abdomen',
       'Bloated Stomach', 'Yellow gums', 'Loss of Consciousness',
       'Blindness', 'WeightLoss', 'Aggression', 'Constipation',
       'Wrinkled forehead', 'Continuously erect and stiff ears',
       'Grinning appearance', 'Stiff and hard tail',
       'Stiffness of muscles', 'excess jaw tone', 'Blood in urine',
       'Bad breath', 'Acute blindness', 'Pale gums', 'Urine infection',
       'Hunger', 'Cataracts', 'Glucose in urine', 'Enlarged Liver',


In [8]:
# how many distinct symptoms are there?
len(sym)
# nan included here, hence the actual number of symptoms is 86

87

# Data Preprocessing

In [9]:
df_dirty.fillna(value='missing', inplace=True)
df_dirty

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Tick fever,Fever,Nasal Discharge,Lameness,Lethargy,Increased drinking and urination,Neurological Disorders,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing
1,Tick fever,Fever,Lameness,Swollen Lymph nodes,Vomiting,Neurological Disorders,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing
2,Tick fever,Fever,Nasal Discharge,Lethargy,Swollen Lymph nodes,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing
3,Tick fever,Fever,Nasal Discharge,Lameness,Vomiting,Neurological Disorders,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing
4,Tick fever,Nasal Discharge,Weight Loss,Breathing Difficulty,Heart Complication,Vomiting,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23994,Skin Rashes,Redness of skin,Scabs,Dry Skin,Red bumps,Red patches,Dandruff,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing
23995,Skin Rashes,Redness of skin,Dry Skin,Fur loss,Red patches,Dandruff,Smelly,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing
23996,Skin Rashes,Itchy skin,Irritation,Red bumps,Red patches,Dandruff,Wounds,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing
23997,Skin Rashes,Itchy skin,Scabs,Fur loss,Red patches,Smelly,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing


In [10]:
# drop unused columns: 8-17
to_drop = df_dirty[['Disease', 'Symptom_8',	'Symptom_9', 'Symptom_10',	'Symptom_11',	'Symptom_12',	'Symptom_13',	'Symptom_14',	'Symptom_15',	'Symptom_16',	'Symptom_17']]

In [11]:
# encoding
cols = df_dirty.drop(to_drop, axis=1)
encoder = MultiLabelBinarizer() # it works on multiple columns
encoder.fit_transform(cols)

array([[1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])

In [12]:
encoder.classes_

array(['1', '2', '3', '4', '5', '6', '7', 'S', '_', 'm', 'o', 'p', 't',
       'y'], dtype=object)

In [13]:
proc = pd.get_dummies(cols)
proc

Unnamed: 0,Symptom_1_Acute blindness,Symptom_1_Bad breath,Symptom_1_Bleeding of gum,Symptom_1_Bloated Stomach,Symptom_1_Blood in urine,Symptom_1_Breathing Difficulty,Symptom_1_Burping,Symptom_1_Cataracts,Symptom_1_Constipation,Symptom_1_Continuously erect and stiff ears,...,Symptom_6_Yellow gums,Symptom_6_excess jaw tone,Symptom_6_lethargy,Symptom_6_missing,Symptom_7_Grinning appearance,Symptom_7_Lethargy,Symptom_7_Stiff and hard tail,Symptom_7_Stiffness of muscles,Symptom_7_excess jaw tone,Symptom_7_missing
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
23995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
23996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
23997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [14]:
proc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23999 entries, 0 to 23998
Columns: 420 entries, Symptom_1_Acute blindness to Symptom_7_missing
dtypes: uint8(420)
memory usage: 9.6 MB


In [15]:
symptoms = []

for col in df_dirty.columns[1:]:
  symptoms.append(df_dirty[col].unique()) #unique symtomps in respective columns

symptoms

[array(['Fever', 'Nasal Discharge', 'Loss of appetite', 'Weight Loss',
        'Lameness', 'Breathing Difficulty', 'Swollen Lymph nodes',
        'Lethargy', 'Depression', 'Coughing', 'Diarrhea', 'Seizures',
        'Vomiting', 'Eating less than usual', 'Excessive Salivation',
        'Redness around Eye area', 'Severe Dehydration', 'Pain',
        'Discomfort', 'Sepsis', 'WeightLoss', 'Tender abdomen',
        'Increased drinking and urination', 'Bloated Stomach',
        'Yellow gums', 'Constipation', 'Paralysis', 'Wrinkled forehead',
        'Continuously erect and stiff ears', 'Grinning appearance',
        'Stiff and hard tail', 'Stiffness of muscles', 'Acute blindness',
        'Blood in urine', 'Hunger', 'Cataracts', 'Losing sight',
        'Glucose in urine', 'Burping', 'blood in stools', 'Passing gases',
        'Eating grass', 'Scratching', 'Licking', 'Itchy skin',
        'Redness of skin', 'Face rubbing', 'Loss of Fur',
        'Swelling of gum', 'Redness of gum', 'Receding

In [16]:
unique_sym = [] # prepare a list to store all unique symptoms

for row in symptoms:
    for symp in row:
        if (symp not in unique_sym) and (symp != 'missing'):
          unique_sym.append(symp.strip())
        else:
          print('already exist', end='\r')

already existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalready existalread

In [17]:
unique_sym
# if you haven't replaced nan value in df_dirty with 'missing' you won't be able to use symp.strip()
# as result, function above won't work properly, nan still included

['Fever',
 'Nasal Discharge',
 'Loss of appetite',
 'Weight Loss',
 'Lameness',
 'Breathing Difficulty',
 'Swollen Lymph nodes',
 'Lethargy',
 'Depression',
 'Coughing',
 'Diarrhea',
 'Seizures',
 'Vomiting',
 'Eating less than usual',
 'Excessive Salivation',
 'Redness around Eye area',
 'Severe Dehydration',
 'Pain',
 'Discomfort',
 'Sepsis',
 'WeightLoss',
 'Tender abdomen',
 'Increased drinking and urination',
 'Bloated Stomach',
 'Yellow gums',
 'Constipation',
 'Paralysis',
 'Wrinkled forehead',
 'Continuously erect and stiff ears',
 'Grinning appearance',
 'Stiff and hard tail',
 'Stiffness of muscles',
 'Acute blindness',
 'Blood in urine',
 'Hunger',
 'Cataracts',
 'Losing sight',
 'Glucose in urine',
 'Burping',
 'blood in stools',
 'Passing gases',
 'Eating grass',
 'Scratching',
 'Licking',
 'Itchy skin',
 'Redness of skin',
 'Face rubbing',
 'Loss of Fur',
 'Swelling of gum',
 'Redness of gum',
 'Receding gum',
 'Bleeding of gum',
 'Plaque',
 'Bad breath',
 'Tartar',
 'Lum

In [18]:
len(unique_sym) # 'missing' value has gone, it's correct we have 86 unique symptoms

86

In [19]:
values = [0.0] * 86
symp_dict = dict(zip(unique_sym, values)) # create dictionary to store unique symptoms and its value
symp_dict

{'Fever': 0.0,
 'Nasal Discharge': 0.0,
 'Loss of appetite': 0.0,
 'Weight Loss': 0.0,
 'Lameness': 0.0,
 'Breathing Difficulty': 0.0,
 'Swollen Lymph nodes': 0.0,
 'Lethargy': 0.0,
 'Depression': 0.0,
 'Coughing': 0.0,
 'Diarrhea': 0.0,
 'Seizures': 0.0,
 'Vomiting': 0.0,
 'Eating less than usual': 0.0,
 'Excessive Salivation': 0.0,
 'Redness around Eye area': 0.0,
 'Severe Dehydration': 0.0,
 'Pain': 0.0,
 'Discomfort': 0.0,
 'Sepsis': 0.0,
 'WeightLoss': 0.0,
 'Tender abdomen': 0.0,
 'Increased drinking and urination': 0.0,
 'Bloated Stomach': 0.0,
 'Yellow gums': 0.0,
 'Constipation': 0.0,
 'Paralysis': 0.0,
 'Wrinkled forehead': 0.0,
 'Continuously erect and stiff ears': 0.0,
 'Grinning appearance': 0.0,
 'Stiff and hard tail': 0.0,
 'Stiffness of muscles': 0.0,
 'Acute blindness': 0.0,
 'Blood in urine': 0.0,
 'Hunger': 0.0,
 'Cataracts': 0.0,
 'Losing sight': 0.0,
 'Glucose in urine': 0.0,
 'Burping': 0.0,
 'blood in stools': 0.0,
 'Passing gases': 0.0,
 'Eating grass': 0.0,
 'S

In [20]:
len({key: 0 for key in symp_dict})

86

In [21]:
encoded_result = []

for i in range(len(df_dirty)):
    row = df_dirty.iloc[i].values
    temp = dict(zip(unique_sym, values))
    for i in row:
        if i != 'missing':
            temp[i.strip()] = 1.0 # fill dictionary value with 1 if it matches the respective column
            
    encoded_result.append(temp)

In [22]:
pre_process = pd.DataFrame(data=encoded_result) # store dictionary to data frame format
pre_process['Disease'] = df_dirty['Disease']
pre_process.fillna(value=0, inplace=True)
pre_process.head()

Unnamed: 0,Fever,Nasal Discharge,Loss of appetite,Weight Loss,Lameness,Breathing Difficulty,Swollen Lymph nodes,Lethargy,Depression,Coughing,...,Hepatitis,Tetanus,Chronic kidney Disease,Diabetes,Gastrointestinal Disease,Allergies,Gingitivis,Cancers,Skin Rashes,Disease
0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tick fever
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tick fever
2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tick fever
3,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tick fever
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tick fever


In [23]:
pre_process.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23999 entries, 0 to 23998
Data columns (total 99 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Fever                              23999 non-null  float64
 1   Nasal Discharge                    23999 non-null  float64
 2   Loss of appetite                   23999 non-null  float64
 3   Weight Loss                        23999 non-null  float64
 4   Lameness                           23999 non-null  float64
 5   Breathing Difficulty               23999 non-null  float64
 6   Swollen Lymph nodes                23999 non-null  float64
 7   Lethargy                           23999 non-null  float64
 8   Depression                         23999 non-null  float64
 9   Coughing                           23999 non-null  float64
 10  Diarrhea                           23999 non-null  float64
 11  Seizures                           23999 non-null  flo

In [24]:
pre_process.isnull().sum()

Fever               0
Nasal Discharge     0
Loss of appetite    0
Weight Loss         0
Lameness            0
                   ..
Allergies           0
Gingitivis          0
Cancers             0
Skin Rashes         0
Disease             0
Length: 99, dtype: int64

In [25]:
# import pre_process df to new csv
pre_process.to_csv('data/pre_processed.csv')