# Import Packages

In [None]:
import re 
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras

In [None]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Read Data

## Symptoms

In [None]:
# Load the disease/symptom table from the mounted Drive folder and preview the first rows.
symptoms = pd.read_csv('/content/drive/MyDrive/CapstoneProjectBangkitMachineLearning/Data/dataset.csv')
symptoms.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [None]:
# (rows, columns) of the symptoms table.
symptoms.shape

(4920, 18)

In [None]:
# Count the missing (null) values in each column.
symptoms.isnull().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [None]:
# Count the rows that are exact copies of an earlier row.
symptoms.duplicated().sum()

4616

## Symptom Severity

In [None]:
# Load the table that assigns a numeric weight to each symptom name and preview it.
severity = pd.read_csv('/content/drive/MyDrive/CapstoneProjectBangkitMachineLearning/Data/Symptom-severity.csv')
severity.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


## Disease Description

In [None]:
# Load the free-text description of each disease and preview it.
description = pd.read_csv('/content/drive/MyDrive/CapstoneProjectBangkitMachineLearning/Data/symptom_Description.csv')
description.head()

Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
4,Psoriasis,Psoriasis is a common skin disorder that forms...


In [None]:
# (rows, columns) of the description table.
description.shape

(41, 2)

## Disease Precaution

In [None]:
# Load the per-disease precaution list (Precaution_1 .. Precaution_4) and preview it.
precaution = pd.read_csv('/content/drive/MyDrive/CapstoneProjectBangkitMachineLearning/Data/symptom_precaution.csv')
precaution.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


# Data Cleaning

In [None]:
# Trim whitespace on the left and right of string cells; other values pass through
def substitute_spaces(string):
  """Return `string` with leading and trailing whitespace removed.

  Values that are not `str` (for example the NaN placeholders pandas uses for
  empty cells) are returned unchanged. Despite the function name, whitespace
  inside the text is kept as-is; only the two ends are trimmed.
  """
  return string.strip() if isinstance(string, str) else string

In [None]:
# Trim surrounding whitespace in every cell of both tables, one column at a
# time: first all of `symptoms`, then all of `severity`.
for frame in (symptoms, severity):
  for cols in frame.columns:
    frame[cols] = frame[cols].apply(substitute_spaces)

In [None]:
# Preview the symptoms table after the whitespace trimming above.
symptoms.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [None]:
# Preview the severity table after the whitespace trimming above.
severity.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [None]:
# Look a symptom name up in the severity table and return its weight
def matched_data(values, severity_table=None):
  """Return the severity weight recorded for the symptom name `values`.

  The name is first compared verbatim against the 'Symptom' column. When that
  finds nothing and `values` is a string, the comparison is repeated with all
  whitespace removed from both sides, so a name such as 'dischromic _patches'
  can still find an entry spelled 'dischromic_patches' instead of silently
  getting the same weight as an empty slot.

  Args:
    values: one cell of a symptom column (a symptom name, or NaN/None when the
      slot is empty).
    severity_table: optional DataFrame with 'Symptom' and 'weight' columns.
      Defaults to the notebook-level `severity` table.

  Returns:
    The weight of the first matching row, or 0 when nothing matches
    (including non-string input).
  """
  table = severity if severity_table is None else severity_table

  # Exact comparison first: anything that matched before resolves identically.
  hits = table.loc[table['Symptom'] == values, 'weight']

  if len(hits) == 0 and isinstance(values, str):
    squeezed = re.sub(r"\s+", "", values)
    if squeezed:
      # Whitespace-insensitive fallback for names with a stray inner space.
      known = table['Symptom'].astype(str).str.replace(r"\s+", "", regex=True)
      hits = table.loc[known == squeezed, 'weight']

  if len(hits) > 0:
    return hits.iloc[0]  # first matching row wins
  return 0

In [None]:
# Names of the symptom columns: 'Symptom_1' through 'Symptom_17'.
cols = [f'Symptom_{slot}' for slot in range(1, 18)]

# Overwrite each symptom column in place with the weight returned by
# matched_data. Cells without a match in the severity table become 0, so
# re-running this cell on the already-numeric columns would turn every value
# into 0 -- reload `symptoms` before running it again.
for i in cols:
  symptoms[i] = symptoms[i].apply(matched_data)

In [None]:
# Show the table after the symptom names were replaced by their severity weights.
symptoms

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,5,3,5,6,4,4,0,0,0,0,0,0,0,0,0,0,0
4916,Acne,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,6,4,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,3,3,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Column dtypes after the weight mapping.
symptoms.dtypes

Disease       object
Symptom_1      int64
Symptom_2      int64
Symptom_3      int64
Symptom_4      int64
Symptom_5      int64
Symptom_6      int64
Symptom_7      int64
Symptom_8      int64
Symptom_9      int64
Symptom_10     int64
Symptom_11     int64
Symptom_12     int64
Symptom_13     int64
Symptom_14     int64
Symptom_15     int64
Symptom_16     int64
Symptom_17     int64
dtype: object

# Modeling

## Split the data into training and validation sets

In [None]:
# Independent copy of the encoded table, used as input for the modelling steps.
data = symptoms.copy()

In [None]:
# Split the data into training and validation sets

# Features are every column except 'Disease'; the label is the disease name.
# Both are taken from `data` so X and y always come from the same frame.
# NOTE(review): an earlier cell reported 4616 fully duplicated rows out of 4920
# in the raw table, so identical samples can land on both sides of this split
# and the validation score may be optimistic. Consider de-duplicating before
# splitting -- to be confirmed.
X = data.drop(columns=['Disease'])
y = data['Disease']

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
# Number of distinct disease labels.
len(y.unique())

41

In [None]:
# Shapes of the training/validation features and labels.
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(3936, 17) (984, 17) (3936,) (984,)


In [None]:
# Standardize the features: learn mean/scale on the training split only, then
# apply that same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Encode the disease names as integer class ids: the mapping is learned from
# the training labels and reused for the validation labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)

In [None]:
# Distinct integer class ids present in the training labels.
np.unique(y_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40])

In [None]:
# Training features after standardization.
X_train

array([[ 1.21553572, -0.12630268, -0.74126606, ..., -0.21917692,
        -0.18957515, -0.11794217],
       [ 1.21553572, -0.12630268,  2.09688881, ..., -0.21917692,
        -0.18957515, -0.11794217],
       [-0.31057228, -0.93595638, -0.74126606, ..., -0.21917692,
        -0.18957515, -0.11794217],
       ...,
       [-0.31057228,  0.68335101, -0.74126606, ..., -0.21917692,
        -0.18957515, -0.11794217],
       [-0.31057228,  2.30265839, -0.03172734, ..., -0.21917692,
        -0.18957515, -0.11794217],
       [-0.31057228,  0.68335101,  2.09688881, ..., -0.21917692,
        -0.18957515, -0.11794217]])

In [None]:
# Seed Python, NumPy and TensorFlow before any layer is created so the weight
# initialisation and batch shuffling are repeatable between runs (some
# hardware-level nondeterminism may remain).
keras.utils.set_random_seed(123)

# One output unit per class known to the label encoder, instead of a
# hard-coded count that would go stale if the set of diseases changes.
num_classes = len(label_encoder.classes_)

# Fully connected classifier: two ReLU hidden layers followed by a softmax
# over the disease classes. The input width is the number of feature columns.
model = keras.Sequential([
    keras.layers.Dense(100, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax')
])

In [None]:
# The labels are integer class ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs in mini-batches of 32, scoring the validation split after
# every epoch. The returned History object is kept so the per-epoch metrics in
# `history.history` can be inspected or plotted afterwards.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa51ebf8130>

In [None]:
# Score the trained model on the held-out validation split. This notebook has
# no separate test set, so the numbers are reported as validation metrics.
loss, accuracy = model.evaluate(X_val, y_val)
print("Validation loss:", loss)
print("Validation accuracy:", accuracy)

Test loss: 0.10039190948009491
Test accuracy: 0.9756097793579102


In [None]:
# Softmax outputs for every validation row: one score per class, not a disease
# name. To turn a row into a label, take its argmax and pass it through
# label_encoder.inverse_transform.
predictions = model.predict(X_val)

