# Identifying Entities in Healthcare Data

## Workspace setup: install and import useful packages

In [None]:
!pip install pycrf
!pip install sklearn-crfsuite
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
    
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
#import en-core_web_sm

# Load the small English spaCy pipeline once; later cells call `model(...)` on
# sentences to obtain part-of-speech tags.
model = spacy.load("en_core_web_sm")

## Data Preprocessing

In [14]:
def process_file(filename, encoding=None):
  """Read a one-token-per-line file and return its sequences as strings.

  Each non-blank line of the file holds a single token (a word or a label).
  A blank line terminates the current sequence. Every sequence is returned
  as one string with its tokens joined by single spaces.

  Args:
    filename: Path of the file to read.
    encoding: Text encoding passed to ``open``. ``None`` (the default) keeps
      the platform default.

  Returns:
    A list of strings, one per sequence, in file order. A blank line that is
    not preceded by any token contributes an empty string, so that sentence
    and label files with the same layout stay index-aligned.
  """
  out_lines = []       # completed sequences (sentences or label rows)
  current_tokens = []  # tokens of the sequence currently being read

  # The context manager closes the file even if reading raises.
  with open(filename, 'r', encoding=encoding) as input_file:
    for line in input_file:
      word = line.strip()
      if word == "":
        # Blank line: the current sequence is complete.
        out_lines.append(" ".join(current_tokens))
        current_tokens = []
      else:
        current_tokens.append(word)

  # A file that does not end with a blank line still has a pending sequence;
  # without this flush the last sentence/label row would be silently lost.
  if current_tokens:
    out_lines.append(" ".join(current_tokens))

  return out_lines

In [15]:
# Load the sentence file and the label file of each split.
train_sentences, train_labels = process_file('train_sent'), process_file('train_label')
test_sentences, test_labels = process_file('test_sent'), process_file('test_label')

In [16]:
# Print the first 5 sentences of the processed training set with their labels.
# Slicing + zip (instead of indexing range(5)) avoids an IndexError when the
# dataset holds fewer than 5 sentences.
for sentence, labels in zip(train_sentences[:5], train_labels[:5]):
  print("Sentence:", sentence)
  print("Labels:", labels, "\n\n")

Sentence: All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )
Labels: O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 


Sentence: The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
Labels: O O O O O O O O O O O O O O O O O O O O O O O O O 


Sentence: Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )
Labels: O O O O O O O O O O O O O O O 


Sentence: The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )
Labels: O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 


Sentence: Arrest of dilation was the most common indication in both `` co

### Count the number of sentences in the processed train and test dataset 

In [17]:
# Report how many sentences each split contains.
for caption, sequences in (
  ("No. of lines in train_sentences:", train_sentences),
  ("No. of lines in test_sentences:", test_sentences),
):
  print(caption, len(sequences))

No. of lines in train_sentences: 2599
No. of lines in test_sentences: 1056


### Count the number of lines of labels in the processed train and test dataset.

In [18]:
# The lengths of the four variables should match the below output
print("No. of lines in train_labels:", len(train_labels))
print("No. of lines in test_labels:", len(test_labels))

# Each sentence needs exactly one row of labels. Fail fast here rather than
# training or evaluating on misaligned sentence/label pairs later on.
assert len(train_labels) == len(train_sentences), "train sentences and labels are misaligned"
assert len(test_labels) == len(test_sentences), "test sentences and labels are misaligned"

No. of lines in train_labels: 2599
No. of lines in test_labels: 1056


In [25]:
import pandas as pd

In [26]:
# Start with an empty container for the text of every NOUN / PROPN token.
noun_propn_tokens_list = list()

In [27]:

# Collect the text of every token that spaCy tags as NOUN or PROPN, across both
# the train and the test sentences.
# The list is rebuilt from scratch (not appended to) so that re-running this
# cell does not add duplicates and inflate the frequencies computed from it.
noun_propn_tokens_list = [
  token.text
  for sentences in (train_sentences, test_sentences)
  for sentence in sentences
  for token in model(sentence)  # run the spaCy pipeline on the sentence
  if token.pos_ in ('NOUN', 'PROPN')
]
        

NameError: name 'model' is not defined

### Extract those tokens which have NOUN or PROPN as their PoS tag and find their frequency

In [28]:
# Wrap the collected tokens in a Series. The dtype is given explicitly so that
# an empty token list still yields a well-defined object Series instead of
# relying on pandas' default dtype for empty data.
df_noun_propn = pd.Series(noun_propn_tokens_list, dtype="object")

# value_counts() already orders by descending frequency, so no additional sort
# is required; show the 25 most frequent tokens.
df_noun_propn.value_counts().head(25)

  df_noun_propn = pd.Series(noun_propn_tokens_list)


Series([], dtype: int64)

### Print the top 25 most common tokens with NOUN or PROPN PoS tags

The result can be verified by checking that the top 25 most common concepts and their frequencies match the following output.

In [None]:
# Let's define the features to get the feature value for one word.

def getFeaturesForOneWord(sentence, pos, pos_tags):
  """Build the list of feature strings for the word at index `pos`.

  Args:
    sentence: List of the words of one sentence.
    pos: Index of the word to describe.
    pos_tags: List of PoS tags, indexed the same way as `sentence`.

  Returns:
    A list of feature strings: 7 features of the word itself, then either
    5 features of the previous word (when there is one) or the marker 'BEG',
    and finally the marker 'END' when the word is the last one.
  """
  word = sentence[pos]

  # Features of the current word, with its PoS tag as one of them
  features = [
    'word.lower=' + word.lower(), # serves as word id
    'word[-3:]=' + word[-3:],     # last three characters
    'word[-2:]=' + word[-2:],     # last two characters
    'word.isupper=%s' % word.isupper(),  # is the word in all uppercase
    'word.isdigit=%s' % word.isdigit(),  # is the word a number
    # word[:1] rather than word[0]: an empty token yields False instead of IndexError
    'word.startsWithCapital=%s' % word[:1].isupper(), # is the word starting with a capital letter
    'word.pos=' + pos_tags[pos]
  ]

  if pos > 0:
    # Add context from the previous word
    prev_word = sentence[pos-1]
    features.extend([
      'prev_word.lower=' + prev_word.lower(),
      'prev_word.isupper=%s' % prev_word.isupper(),
      'prev_word.isdigit=%s' % prev_word.isdigit(),
      'prev_word.startsWithCapital=%s' % prev_word[:1].isupper(),
      'prev_word.pos=' + pos_tags[pos-1]
    ])
  else:
    features.append('BEG') # feature to track the beginning of the sentence

  if pos == len(sentence) - 1:
    features.append('END') # feature to track the end of the sentence

  return features

## Getting the features

### Define a function to get the features for a sentence

In [None]:
# Define a function to get features for a sentence using the 'getFeaturesForOneWord' function.
def getFeaturesForOneSentence(sentence):
  """Return one feature list per whitespace-separated word of `sentence`.

  The sentence is split on whitespace, and the PoS tagging is run on exactly
  those words. Calling `model(sentence)` directly would let spaCy re-tokenize
  the text (for example splitting hyphenated words or punctuation), so the
  i-th tag would no longer describe the i-th word handed to
  `getFeaturesForOneWord`.

  Args:
    sentence: A sentence as a single string of space-separated words.

  Returns:
    A list with one entry per word; each entry is the list of feature strings
    produced by `getFeaturesForOneWord`. An empty or whitespace-only sentence
    yields an empty list.
  """
  from spacy.tokens import Doc  # local import: spaCy is already a dependency of this notebook

  sentence_list = sentence.split() # List of words in sentence
  if not sentence_list:
    return []

  # Build a Doc from the pre-split words, then apply every pipeline component
  # (tagger etc.) to it, so that there is exactly one token per word.
  processed = Doc(model.vocab, words=sentence_list)
  for _, component in model.pipeline:
    processed = component(processed)

  pos_tags = [token.pos_ for token in processed]

  return [getFeaturesForOneWord(sentence_list, pos, pos_tags) for pos in range(len(sentence_list))]

### Define a function to get the labels of a sentence

In [None]:
# Turn the label string of one sentence into a list of labels.
def getLabelsInListForOneSentence(labels):
  """Split a whitespace-separated label string into a list of labels."""
  label_list = labels.split()
  return label_list

In [None]:
# Feature sequences: one list of per-word feature lists for every sentence.
X_train = list(map(getFeaturesForOneSentence, train_sentences))
X_test = list(map(getFeaturesForOneSentence, test_sentences))

In [None]:
# Label sequences: one list of labels for every sentence.
Y_train = list(map(getLabelsInListForOneSentence, train_labels))
Y_test = list(map(getLabelsInListForOneSentence, test_labels))

## Build the CRF Model

In [None]:
# Build the CRF model.

# Instantiate the CRF estimator with max_iterations set to 100.
crf = sklearn_crfsuite.CRF(max_iterations=100)

# Fit on the training split only (X_train features, Y_train labels); the test
# split is not passed to fit.
crf.fit(X_train, Y_train)

## Evaluation

In [None]:
# Predict labels for the test features with the fitted CRF; the dictionary
# cell below reads Y_pred[i] as the label sequence of test_sentences[i].
Y_pred = crf.predict(X_test)

### Calculate the f1 score using the actual labels and the predicted labels of the test dataset.

In [None]:
# F1 score of the predicted labels against the true test labels, computed with
# average='weighted'. As the last expression, its value is displayed by the cell.
metrics.flat_f1_score(Y_test, Y_pred, average='weighted')

In [None]:
# Dictionary mapping each predicted disease to the list of treatments that were
# predicted in the same sentence(s). Values are always lists, so a disease seen
# in several sentences accumulates its treatments cleanly (previously the first
# treatment was stored as a string and later exploded into single characters).
dict_D = dict()

for sentence, labels in zip(test_sentences, Y_pred):
    words = sentence.split()  # split once per sentence, not once per token

    # Words labelled 'D' form the disease, words labelled 'T' the treatment.
    Diseases = " ".join(word for word, label in zip(words, labels) if label == 'D')
    Treatments = " ".join(word for word, label in zip(words, labels) if label == 'T')

    # Only sentences that contain both a disease and a treatment are recorded.
    if Diseases and Treatments:
        dict_D.setdefault(Diseases, []).append(Treatments)

### Predict the treatment for the disease name: 'hereditary retinoblastoma'

This is just to check the dictionary created. 

In [None]:
# Look up the predicted treatment(s) in `dict_D`, the disease -> treatment
# dictionary built above (the name `D_T_dict` is never defined in this notebook).
dict_D['hereditary retinoblastoma']