# Building an NLP model to classify medical terminology, over 200k samples


In [132]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# Getting the data: clone the pubmed-rct repository, whose files are read in the
# cells below, into the current working directory.
# NOTE(review): re-running this cell once the directory exists presumably makes
# git report an error instead of cloning again — confirm this is acceptable.

!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git

In [134]:
# Reading the data from each file into a variable

def readlines(filename):
  """Return every line of the text file `filename` as a list of strings.

  Each returned line keeps its trailing newline character. The file is decoded
  as UTF-8 explicitly so the result does not depend on the platform's default
  encoding.

  Args:
    filename: Path of the text file to read.

  Returns:
    A list with one string per line of the file (empty for an empty file).
  """
  with open(filename, 'r', encoding='utf-8') as f:
    return f.readlines()
    

In [135]:
# Setting our unprepared data into variables.
# The dataset directory is named once so that switching to another copy or
# variant of the corpus only requires editing a single line.

DATA_DIR = 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign'

train_data_unprepared = readlines(f'{DATA_DIR}/train.txt')
test_data_unprepared = readlines(f'{DATA_DIR}/test.txt')
valid_data_unprepared = readlines(f'{DATA_DIR}/dev.txt')  # the validation split is read from dev.txt

In [136]:
# Function to parse the raw file lines into per-sentence records that load directly into a DataFrame

def transform_Into_DataFrame(file_data):
  """Parse raw dataset lines into one record (dict) per sentence.

  Layout recognised by this parser: an abstract starts with a line beginning
  with '###', followed by one "target<TAB>text" line per sentence. Blank lines
  are ignored.

  Args:
    file_data: Iterable of raw text lines (e.g. the result of `readlines`).

  Returns:
    A list of dicts with the keys:
      'line_number': zero-based position of the sentence inside its abstract,
      'target': the part of the line before the first tab,
      'text': the part of the line after the first tab, without its newline,
      'total_lines': number of sentences in the abstract the sentence belongs to.
  """
  data = []
  abstract_start = 0  # index in `data` of the first record of the current abstract

  def close_abstract():
    # Back-fill the sentence count on every record of the abstract being closed.
    sentence_count = len(data) - abstract_start
    for record in data[abstract_start:]:
      record['total_lines'] = sentence_count

  for line in file_data:
    if not line.strip():
      continue  # blank separator line
    if line.startswith('###'):
      # A new header ends the previous abstract (a no-op for the very first header)
      close_abstract()
      abstract_start = len(data)
      continue

    target, _, text = line.partition('\t')
    data.append({
        'line_number': len(data) - abstract_start,
        'target': target,
        # Strip only the newline: a final line without one must keep its last character
        'text': text.rstrip('\n'),
        'total_lines': 0,
    })

  # The last abstract is not followed by another '###' header, so close it explicitly
  close_abstract()
  return data
  

In [137]:
# Parsing each raw split into a list of per-sentence records

train_data, test_data, valid_data = (
    transform_Into_DataFrame(raw_lines)
    for raw_lines in (train_data_unprepared, test_data_unprepared, valid_data_unprepared)
)

In [138]:
# Turning each list of records into a DataFrame (one row per sentence)

train_data_df, test_data_df, valid_data_df = map(
    pd.DataFrame, (train_data, test_data, valid_data)
)

In [139]:
# Splitting every DataFrame into labels (dependent variable, y) and sentences (independent variable, X)

train_labels, train_sentences = train_data_df['target'], train_data_df['text']
valid_labels, valid_sentences = valid_data_df['target'], valid_data_df['text']
test_labels, test_sentences = test_data_df['target'], test_data_df['text']


In [140]:
# Encoding the labels as one-hot vectors.
# The encoder is fitted on the training labels only and then reused for the other
# splits, so every split shares the same column order and column count.

from sklearn.preprocessing import OneHotEncoder

# Dense output is obtained through `.toarray()` instead of the `sparse=False`
# constructor argument, which newer scikit-learn releases renamed to `sparse_output`.
onehot = OneHotEncoder()

# Reading the labels from the DataFrames (rather than from the `*_labels` names this
# cell assigns) keeps the cell safe to re-run: its inputs are never overwritten.
train_labels = onehot.fit_transform(train_data_df['target'].to_numpy().reshape(-1, 1)).toarray()
valid_labels = onehot.transform(valid_data_df['target'].to_numpy().reshape(-1, 1)).toarray()
test_labels = onehot.transform(test_data_df['target'].to_numpy().reshape(-1, 1)).toarray()

In [141]:
# Tokenising the sentences with the Keras `TextVectorization` layer,
# turning the tokens (words) into integer ids

try:
  from tensorflow.keras.layers import TextVectorization
except ImportError:
  # Older TensorFlow releases only expose the layer under the experimental namespace
  from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Cap the vocabulary so that token ids have a known upper bound (always < 68000);
# an unbounded vocabulary could emit ids larger than an embedding table sized in advance.
train_vectorizer = TextVectorization(max_tokens=68000)
train_vectorizer.adapt(train_sentences)


In [142]:
# Preparing the embedding layer used later when constructing the model.
# The table gets exactly one row per token id the fitted vectorizer can emit, so no
# id can fall outside the table; each token is embedded as a vector of 512 values.

VOCAB_SIZE = len(train_vectorizer.get_vocabulary())
EMBEDDING_DIM = 512

embedded = tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)

In [143]:
# Creating the model: raw string -> token ids -> embeddings -> LSTM -> class probabilities

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
token_ids = train_vectorizer(inputs)
token_embeddings = embedded(token_ids)

# An LSTM (Long Short-Term Memory) layer with 64 units reads the embedded token sequence

sentence_encoding = tf.keras.layers.LSTM(64)(token_embeddings)
outputs = tf.keras.layers.Dense(5, activation='softmax')(sentence_encoding)
model = tf.keras.Model(inputs=inputs, outputs=outputs, name='model')

In [144]:
# Wrapping each split in a tf.data pipeline for faster computing when fitting the model

def _batched_dataset(sentences, labels):
  """Slice (sentences, labels) per sample, group into batches of 32 and prefetch."""
  samples = tf.data.Dataset.from_tensor_slices((sentences, labels))
  return samples.batch(32).prefetch(tf.data.AUTOTUNE)

train_dataset = _batched_dataset(train_sentences, train_labels)
valid_dataset = _batched_dataset(valid_sentences, valid_labels)
test_dataset = _batched_dataset(test_sentences, test_labels)

In [145]:
# Compiling the model: Adam optimizer, categorical cross-entropy loss, accuracy metric

model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)


In [None]:
# Fitting the model on only 10% of the batches per epoch (training and validation)
# for faster computing and testing

train_steps = int(0.1 * len(train_dataset))
valid_steps = int(0.1 * len(valid_dataset))

model.fit(
    train_dataset,
    epochs=8,
    steps_per_epoch=train_steps,
    validation_data=valid_dataset,
    validation_steps=valid_steps,
);

In [147]:
# Rebuilding the model so that training really starts from scratch.
# A new Embedding layer with the same dimensions is created here: reusing the
# `embedded` layer object that was part of the previously fitted model would carry
# its already-trained weights into this supposedly fresh model.

fresh_embedding = tf.keras.layers.Embedding(embedded.input_dim, embedded.output_dim)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
X = train_vectorizer(inputs)  # the vectorizer only holds the vocabulary, so it is safe to reuse
X = fresh_embedding(X)

X = tf.keras.layers.LSTM(64)(X)
outputs = tf.keras.layers.Dense(5, activation='softmax')(X)
model = tf.keras.Model(inputs,outputs, name='model')

In [148]:
# Compiling the rebuilt model: Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)


In [None]:
# Setting a callback that changes the learning rate at every epoch

def _lr_for_epoch(epoch):
  """Return the learning rate for `epoch`: 1e-4 * 10 ** (epoch / 20)."""
  return 1e-4 * 10 ** (epoch/20)

lr_callback = tf.keras.callbacks.LearningRateScheduler(_lr_for_epoch)

train_steps = int(0.1 * len(train_dataset))
valid_steps = int(0.1 * len(valid_dataset))

history = model.fit(
    train_dataset,
    epochs=8,
    steps_per_epoch=train_steps,
    validation_data=valid_dataset,
    validation_steps=valid_steps,
    callbacks=[lr_callback],
)

In [None]:
# Plotting the learning rate vs. loss in order to determine the right value.
# The learning rates are rebuilt with the scheduler's formula for exactly the number
# of epochs recorded in `history`, so both plotted series always have the same length.

epochs_run = np.arange(len(history.history['loss']))
lrs = 1e-4 * 10 ** (epochs_run / 20)

fig, ax = plt.subplots(figsize=(10, 7))
ax.semilogx(lrs, history.history['loss'])
ax.set_xlabel("Learning Rate")
ax.set_ylabel("Loss")
ax.set_title('Learning rate vs. Loss');

In [None]:
# Testing our model predictions on the first validation sentence

sample_sentence = valid_sentences.iloc[0]  # positional access, independent of the index labels
print('Sentence is : ', sample_sentence)

# Wrap the sentence in a string tensor holding a single sample before predicting
pred = model.predict(tf.constant([sample_sentence]))
index_res = tf.math.argmax(pred[0]).numpy()

# Class names come from the fitted encoder so they stay aligned with the columns
# of the one-hot labels instead of relying on a hand-written list
labels = list(onehot.categories_[0])
print('Predicted: ', labels[index_res])