Imports

In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
tf.gfile = tf.io.gfile
import tensorflow_hub as hub
import tensorflow_text as text
from bert import tokenization
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow import keras
import requests, json

Read in data and make it into a pd dataframe so that it is readable and useable

In [2]:
response = requests.get("http://127.0.0.1:5000/data")
dataVals = response.json()
data = dataVals.get("data")[0][1]

In [3]:
dfWhole = pd.DataFrame(columns=["Sentence","Condition"])
dfWhole

Unnamed: 0,Sentence,Condition


In [4]:
for condition in list(data.keys()):
    for sentence in data.get(condition):
        dfWhole.loc[len(dfWhole.index)] = [sentence,condition] 

In [5]:
dfWhole

Unnamed: 0,Sentence,Condition
0,spinning movements but i also feel unsteadines...,(vertigo) Paroymsal Positional Vertigo
1,spinning movements loss of balance spinning mo...,(vertigo) Paroymsal Positional Vertigo
2,spinning movements vomiting i feel like headac...,(vertigo) Paroymsal Positional Vertigo
3,spinning movements unsteadiness headache and i...,(vertigo) Paroymsal Positional Vertigo
4,spinning movements vomiting i also feel like u...,(vertigo) Paroymsal Positional Vertigo
...,...,...
161395,urine and i also feel like abdominal pain loss...,hepatitis A
161396,urine yellowing of appetite vomiting but i als...,hepatitis A
161397,urine diarrhoea joint pain and i feel joint pa...,hepatitis A
161398,urine vomiting yellowish skin mild fever but i...,hepatitis A


Now that we have the data in a dataframe format, we can work with the preprocessing and model

In [6]:
dfCopy = dfWhole.copy()
features = dfCopy["Sentence"]
label = dfCopy["Condition"]

le = LabelEncoder()
le.fit(label)
encoded_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
encoded_diseases = le.transform(label)

In [37]:
encoded_mapping = {'(vertigo) Paroymsal  Positional Vertigo': 0,
 'AIDS': 1,
 'Acne': 2,
 'Alcoholic hepatitis': 3,
 'Allergy': 4,
 'Arthritis': 5,
 'Bronchial Asthma': 6,
 'Cervical spondylosis': 7,
 'Chicken pox': 8,
 'Chronic cholestasis': 9,
 'Common Cold': 10,
 'Dengue': 11,
 'Diabetes ': 12,
 'Dimorphic hemmorhoids(piles)': 13,
 'Drug Reaction': 14,
 'Fungal infection': 15,
 'GERD': 16,
 'Gastroenteritis': 17,
 'Heart attack': 18,
 'Hepatitis B': 19,
 'Hepatitis C': 20,
 'Hepatitis D': 21,
 'Hepatitis E': 22,
 'Hypertension ': 23,
 'Hyperthyroidism': 24,
 'Hypoglycemia': 25,
 'Hypothyroidism': 26,
 'Impetigo': 27,
 'Jaundice': 28,
 'Malaria': 29,
 'Migraine': 30,
 'Osteoarthristis': 31,
 'Paralysis (brain hemorrhage)': 32,
 'Peptic ulcer diseae': 33,
 'Pneumonia': 34,
 'Psoriasis': 35,
 'Tuberculosis': 36,
 'Typhoid': 37,
 'Urinary tract infection': 38,
 'Varicose veins': 39,
 'hepatitis A': 40}

In [38]:
encoded_mapping2 = {}

for condition in list(encoded_mapping.keys()):
    encoded_mapping2[encoded_mapping.get(condition)] = condition

In [39]:
encoded_mapping2

{0: '(vertigo) Paroymsal  Positional Vertigo',
 1: 'AIDS',
 2: 'Acne',
 3: 'Alcoholic hepatitis',
 4: 'Allergy',
 5: 'Arthritis',
 6: 'Bronchial Asthma',
 7: 'Cervical spondylosis',
 8: 'Chicken pox',
 9: 'Chronic cholestasis',
 10: 'Common Cold',
 11: 'Dengue',
 12: 'Diabetes ',
 13: 'Dimorphic hemmorhoids(piles)',
 14: 'Drug Reaction',
 15: 'Fungal infection',
 16: 'GERD',
 17: 'Gastroenteritis',
 18: 'Heart attack',
 19: 'Hepatitis B',
 20: 'Hepatitis C',
 21: 'Hepatitis D',
 22: 'Hepatitis E',
 23: 'Hypertension ',
 24: 'Hyperthyroidism',
 25: 'Hypoglycemia',
 26: 'Hypothyroidism',
 27: 'Impetigo',
 28: 'Jaundice',
 29: 'Malaria',
 30: 'Migraine',
 31: 'Osteoarthristis',
 32: 'Paralysis (brain hemorrhage)',
 33: 'Peptic ulcer diseae',
 34: 'Pneumonia',
 35: 'Psoriasis',
 36: 'Tuberculosis',
 37: 'Typhoid',
 38: 'Urinary tract infection',
 39: 'Varicose veins',
 40: 'hepatitis A'}

In [7]:
encoded_diseases.reshape(len(encoded_diseases),1)
encoded_diseases.shape

(161400,)

In [8]:
diseases_column = pd.DataFrame(encoded_diseases,columns=["Disease"])
diseases_column

Unnamed: 0,Disease
0,0
1,0
2,0
3,0
4,0
...,...
161395,40
161396,40
161397,40
161398,40


In [9]:
preprocessed_data = pd.concat([features, diseases_column],axis=1)

In [18]:
#preprocessed_data.to_csv('preprocessed_data.csv',index=False)

Get a model with BERT set up

In [10]:
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)



2022-10-02 19:55:21.575086: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
vocab_file = bert_encoder.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_encoder.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

Split into Train and test data

In [12]:
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data["Sentence"],preprocessed_data["Disease"], stratify=None)

In [13]:
max_len = 150
train_input = bert_encode(X_train.values, tokenizer, max_len=max_len)
test_input = bert_encode(X_test.values, tokenizer, max_len=max_len)
train_labels = keras.utils.to_categorical(y_train.values, num_classes=41)

Define a Model

In [33]:
def model(metrics,bert_layer,max_len=512,):

    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(32, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(41, activation='softmax')(net)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=1e-5), loss='categorical_crossentropy', metrics=metrics)
    
    return model

In [34]:
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

bert_model = model(METRICS, bert_encoder, max_len=max_len)
bert_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 150)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 150)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 150)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 150, 768)]                'input_mask[0][0]',       

  super().__init__(name, **kwargs)


In [16]:
train_history = bert_model.fit(
    train_input, train_labels, 
    validation_split=0.2,
    epochs=3,
    verbose=1
)

Epoch 1/3
  28/3027 [..............................] - ETA: 44:42:35 - loss: 3.7267 - accuracy: 0.9756 - precision: 0.0000e+00 - recall: 0.0000e+00

After training in google colab: https://colab.research.google.com/drive/1ZmIIuCl_a4xXyjhImgMgU5jcbEsF9R7a?usp=sharing


Model loaded and evaluated in colab

View training history over time

In [None]:
plt.plot(train_history.history['accuracy'])
plt.plot(train_history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

Check the loss of model over time

In [None]:
plt.plot(train_history.history['loss'])
plt.plot(train_history.history['val_loss'])
plt.title('Model Cost')
plt.ylabel('Cost')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [35]:
weights_dir = "./model_weights/bert_weights"
bert_model.load_weights(weights_dir)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7fa4d7368b50>