In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import spacy

nlp = spacy.load("en_core_web_sm")


# Read data from the URL for test / train sentences.

# Constructs proper sentences from individual words and prints five sentences

# Correctly counts the number of sentences in the processed train and test dataset
# Correctly counts the number of lines of labels in the processed train and test datasets

In [None]:
## Read data from the url
def read_data_from_url(url):
  response = requests.get(url)
  data = response.text

  return data

In [None]:
### Read data from the train sentences url
train_sentences_url = "https://raw.githubusercontent.com/TheExorcist/NER-datasets/main/train_sent"
train_sentences_data = read_data_from_url(train_sentences_url)

In [150]:
def generate_sentences(data):
  sentences = []
  sentence = ""
  for line in data.split('\n'):
    line = line.strip()
    if line == "":
      sentences.append(sentence.strip())
      sentence = ""
    else:
      sentence += line + " "
  return sentences


['All', 'live', 'births', '>', 'or', '=', '23', 'weeks', 'at', 'the', 'University', 'of', 'Vermont', 'in', '1995', '(', 'n', '=', '2395', ')', 'were', 'retrospectively', 'analyzed', 'for', 'delivery', 'route', ',', 'indication', 'for', 'cesarean', ',', 'gestational', 'age', ',', 'parity', ',', 'and', 'practice', 'group', '(', 'to', 'reflect', 'risk', 'status', ')', '.']


In [151]:
## convert sentence tokens to the sentences using spacy
train_sentences = generate_sentences(train_sentences_data)
train_sentences[0:5]

['All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )',
 'The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )',
 'Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )',
 "The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )",
 "Arrest of dilation was the most common indication in both `` corrected '' subgroups ( 23.4 and 24.6 % , respectively )"]

In [152]:
## length of the sentences
print(len(train_sentences))

2600


In [None]:
# count the number of lines in the train labels.
train_label_url = "https://raw.githubusercontent.com/TheExorcist/NER-datasets/main/train_label"
train_label_data = read_data_from_url(train_label_url)
train_label_sentences = generate_sentences(train_label_data)

In [None]:
# count the number of train labels.
len(train_label_sentences)

2600

In [177]:
## Read data from the test sentences
test_sentences_url = "https://raw.githubusercontent.com/TheExorcist/NER-datasets/main/test_sent"
test_sentences_data = read_data_from_url(test_sentences_url)
test_sentences = generate_sentences(test_sentences_data)

In [178]:
## print length of the test sentence
print(len(test_sentences))

1057


# Writes the code to get a list of labels of a given preprocessed label line that you have created earlie

In [None]:
## Read data from the test label
test_label_url = "https://raw.githubusercontent.com/TheExorcist/NER-datasets/main/test_label"
test_label_data = read_data_from_url(test_label_url)
test_label_sentences = generate_sentences(test_label_data)

In [None]:
## print length of the test labels
print(len(test_label_sentences))

1057


# Uses a toolkit like spaCy to extract those tokens that have NOUN or PROPN as their PoS tag and finds their frequency from the entire dataset that comprises both the train and the test datasets

In [None]:
from collections import Counter

def count_pos_tags(sentences):
  pos_tag = ['NOUN', 'PROPN']
  counter = Counter()
  for sentence in sentences:
    doc = nlp(sentence)
    for token in doc:
      if token.pos_ in pos_tag:
        counter[token.text] += 1
  return counter

In [None]:
all_sentences = train_sentences + test_sentences
token_counter = count_pos_tags(all_sentences)

# Prints the top 25 most common tokens with NOUN or PROPN PoS tags for the entire dataset that comprises both the train and the test datasets

In [None]:
token_counter.most_common(25)

[('patients', 492),
 ('treatment', 281),
 ('%', 247),
 ('cancer', 200),
 ('therapy', 175),
 ('study', 154),
 ('disease', 142),
 ('cell', 140),
 ('lung', 116),
 ('group', 94),
 ('gene', 88),
 ('chemotherapy', 88),
 ('effects', 85),
 ('results', 79),
 ('women', 77),
 ('use', 74),
 ('TO_SEE', 74),
 ('risk', 71),
 ('cases', 71),
 ('surgery', 71),
 ('analysis', 70),
 ('rate', 67),
 ('response', 66),
 ('survival', 65),
 ('children', 64)]

In [249]:
## print label lines
train_labels = list(map(lambda x: x.strip(), train_label_sentences))
train_labels = list(map(lambda x: x.split(), train_labels))
train_labels[0:5]

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']]

# Extracts the features' values for each sentence as an input variable for the CRF model in the test and the train datasets

In [258]:
def extract_features(doc, index):
    """
    Extract features for a token at a given index in the document, including sentence boundary markers.

    Args:
    - doc (spacy.tokens.Doc): A spaCy Doc object containing the processed text.
    - index (int): The index of the token for which to extract features.

    Returns:
    - dict: A dictionary of features for the token at the specified index.
    """
    token = doc[index]

    # Basic features of the current token
    features = {
        'text': token.text,                    # Current token text
        'lowercase': token.text.lower(),       # Lowercase version of the token
        'pos': token.pos_,                     # Coarse-grained POS tag of the current token
        'tag': token.tag_,                     # Fine-grained POS tag of the current token
        'shape': token.shape_,                 # Shape of the current token (e.g., 'Xxxxx', 'd' for digits)
        'is_alpha': token.is_alpha,            # Whether the token is alphabetic
        'is_title': token.is_title,            # Whether the token is title-cased
        'is_digit': token.is_digit,            # Whether the token is a digit
        'is_stop': token.is_stop               # Whether the token is a stop word
    }

    # Features from the previous token, if it exists
    if index > 0:
        prev_token = doc[index - 1]
        features.update({
            'prev_text': prev_token.text,      # Previous token text
            'prev_lowercase': prev_token.text.lower(),  # Lowercase of the previous token
            'prev_pos': prev_token.pos_,       # Coarse-grained POS tag of the previous token
            'prev_tag': prev_token.tag_,       # Fine-grained POS tag of the previous token
            'prev_shape': prev_token.shape_,   # Shape of the previous token
            'prev_is_alpha': prev_token.is_alpha,  # Whether the previous token is alphabetic
            'prev_is_title': prev_token.is_title,  # Whether the previous token is title-cased
            'prev_is_digit': prev_token.is_digit,  # Whether the previous token is a digit
            'prev_is_stop': prev_token.is_stop  # Whether the previous token is a stop word
        })
    else:
        # Special features for the first token where there is no previous token
        features.update({
            'prev_text': '<START>',
            'prev_pos': '<START>',
            'prev_tag': '<START>',
            'prev_shape': '<START>',
            'prev_is_alpha': False,
            'prev_is_title': False,
            'prev_is_digit': False,
            'prev_is_stop': False
        })

    # Sentence boundary features
    features.update({
        'is_sentence_start': token.is_sent_start,  # Whether the token is at the start of a sentence
        'is_sentence_end': token.is_sent_end       # Whether the token is at the end of a sentence
    })

    return features


train_features = []
for index, sentence in enumerate(train_sentences):
    doc = nlp(sentence)
    train_feature_list = []
    for i in range(len(doc)):
      train_feature_list.append(extract_features(doc, i))
    train_features.append(train_feature_list[0: len(train_labels[index])])

In [238]:
print(train_features[0:30])

[[{'text': 'All', 'lowercase': 'all', 'pos': 'DET', 'tag': 'DT', 'shape': 'Xxx', 'is_alpha': True, 'is_title': True, 'is_digit': False, 'is_stop': True, 'prev_text': '<START>', 'prev_pos': '<START>', 'prev_tag': '<START>', 'prev_shape': '<START>', 'prev_is_alpha': False, 'prev_is_title': False, 'prev_is_digit': False, 'prev_is_stop': False, 'is_sentence_start': True, 'is_sentence_end': False}, {'text': 'live', 'lowercase': 'live', 'pos': 'ADJ', 'tag': 'JJ', 'shape': 'xxxx', 'is_alpha': True, 'is_title': False, 'is_digit': False, 'is_stop': False, 'prev_text': 'All', 'prev_lowercase': 'all', 'prev_pos': 'DET', 'prev_tag': 'DT', 'prev_shape': 'Xxx', 'prev_is_alpha': True, 'prev_is_title': True, 'prev_is_digit': False, 'prev_is_stop': True, 'is_sentence_start': False, 'is_sentence_end': False}, {'text': 'births', 'lowercase': 'births', 'pos': 'NOUN', 'tag': 'NNS', 'shape': 'xxxx', 'is_alpha': True, 'is_title': False, 'is_digit': False, 'is_stop': False, 'prev_text': 'live', 'prev_lowercas

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [219]:
print(len(train_features))
print(len(train_labels))

2600
2600


In [179]:
## features for the test sentenances
test_features = []
for sentence in test_sentences:
    doc = nlp(sentence)
    feature_list = []
    for i in range(len(doc)):
        feature_list.append(extract_features(doc, i))

    test_features.append(feature_list)

test_features[0:5]

[[{'text': 'Furthermore',
   'lowercase': 'furthermore',
   'pos': 'ADV',
   'tag': 'RB',
   'shape': 'Xxxxx',
   'is_alpha': True,
   'is_title': True,
   'is_digit': False,
   'is_stop': False,
   'prev_text': '<START>',
   'prev_pos': '<START>',
   'prev_tag': '<START>',
   'prev_shape': '<START>',
   'prev_is_alpha': False,
   'prev_is_title': False,
   'prev_is_digit': False,
   'prev_is_stop': False,
   'is_sentence_start': True,
   'is_sentence_end': False},
  {'text': ',',
   'lowercase': ',',
   'pos': 'PUNCT',
   'tag': ',',
   'shape': ',',
   'is_alpha': False,
   'is_title': False,
   'is_digit': False,
   'is_stop': False,
   'prev_text': 'Furthermore',
   'prev_lowercase': 'furthermore',
   'prev_pos': 'ADV',
   'prev_tag': 'RB',
   'prev_shape': 'Xxxxx',
   'prev_is_alpha': True,
   'prev_is_title': True,
   'prev_is_digit': False,
   'prev_is_stop': False,
   'is_sentence_start': False,
   'is_sentence_end': False},
  {'text': 'when',
   'lowercase': 'when',
   'pos': 

In [181]:
## get the train feature length

print(len(test_features))
print(len(test_labels))

1057
1057


# Extracts the labels as the target variable for the test and the train datasets

In [182]:
test_labels = list(map(lambda x: x.strip(), test_label_sentences))
test_labels = list(map(lambda x: x.split(), test_labels))
test_labels[0:5]

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

In [158]:
!pip install sklearn-crfsuite



In [174]:
def validating_dimensions_for_training_dataset(training_features, training_labels):
  mismatch_count = 0

  for training_feature, training_label in zip(training_features, training_labels):
    if len(training_feature) != len(training_label):
      # print(training_feature, training_label)
      # print(len(training_feature), len(training_label))
      mismatch_count += 1
  print(mismatch_count)
  return True

In [255]:
validating_dimensions_for_training_dataset(train_features, train_labels)

2599


True

# Predicts the labels of each of the tokens in each sentence of the test dataset that has been preprocessed earlier

Builds the CRF model for a custom NER application


In [259]:
## using sklearn_crfsuite to Build the CRF model for a custom NER application
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(max_iterations=100)
crf.fit(train_features, train_labels)
test_predictions = crf.predict(test_features)
test_predictions[0:5]


array([list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
       list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
       list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
       list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
       list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])],
      dtype=object)

In [221]:
print(len(test_predictions))
print(len(test_labels))

1057
1057


# Calculate the F1 score for the predicted label.

In [225]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score


# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform the actual and predicted labels
actual_binary = mlb.fit_transform(test_labels)
predicted_binary = mlb.transform(test_predictions)

# Calculate the F1 score with 'micro' average for multi-label
f1 = f1_score(actual_binary, predicted_binary, average='micro')

print(f'F1 Score: {f1:.4f}')


F1 Score: 0.8247


# Get the mapping diseases and treatment mapping

Creates the code or logic to get all the predicted treatments (T) labels corresponding to each disease (D) label in the test dataset **bold text**


In [208]:
treatment_mapping = {}
diseases_list = set([])

def prepare_disease_and_treatment_mapping(test_features, test_predictions):
  for test_feature, test_prediction in zip(test_features, test_predictions):
    for feature, prediction in zip(test_feature, test_prediction):
      select_all_the_diseases = filter(lambda x: x[1] == 'D', zip(test_feature, test_prediction))
      select_all_the_diseases = map(lambda x: x[0]['text'], select_all_the_diseases)
      diseases = list(select_all_the_diseases)

      if prediction == 'D':
        diseases_list.add(feature['text'])

      if prediction == 'T':
        if feature['text'] == 'radiotherapy':
          print(list(map(lambda x: x['text'], test_feature)))
        treatment_mapping[feature['text']] = diseases

print(treatment_mapping)

{}


In [233]:
prepare_disease_and_treatment_mapping(test_features, test_predictions)

['A', '66', '-', 'year', '-', 'old', 'male', 'engineer', 'diagnosed', 'with', 'malignant', 'pleural', 'mesothelioma', '4', 'years', 'previously', 'had', 'thoracotomy', ',', 'radiotherapy', ',', 'and', 'chemotherapy']
['purpose', ':', 'many', 'patients', 'with', 'locally', 'advanced', 'non', '-', 'small', '-', 'cell', 'lung', 'cancer', '(', 'la', '-', 'nsclc', ')', 'are', 'eligible', 'for', 'combined', '-', 'modality', 'therapy', '(', 'cmt', ';', 'chemotherapy', 'and', 'radiotherapy', ')']
['we', 'aimed', 'to', 'investigate', 'whether', 'biological', 'factors', 'related', 'to', 'radiosensitivity', 'and', 'chemosensitivity', 'have', 'prognostic', 'significance', 'in', 'non', '-', 'small', '-', 'cell', '-', 'lung', '-', 'cancer', '(', 'nsclc', ')', 'patients', 'treated', 'with', 'daily', 'low', 'doses', 'of', 'cisplatin', 'and', 'radiotherapy']
['for', 'intrathoracic', 'relapse', 'and', 'severe', 'obstruction', 'of', 'main', 'bronchus', 'and/or', 'superior', 'caval', 'vein', ',', 'radioth

In [202]:
treatment_mapping

{'methylphenidate': ['attention', 'deficit', 'hyperactivity', 'disorder'],
 'Antichlamydial': [],
 'antibiotics': ['symptoms', 'of', 'a', 'common', 'cold'],
 'electrical': [],
 'nerve': [],
 'stimulation': [],
 '(': ['inflammatory', 'and', 'autoimmune', 'diseases'],
 'ENS': [],
 ')': ['inflammatory', 'and', 'autoimmune', 'diseases'],
 'therapy': ['cerebral', 'palsy'],
 'muscle': [],
 'EMS': [],
 'intravenous': ['inflammatory', 'and', 'autoimmune', 'diseases'],
 'antibiotic': ['bacterial',
  'meningitis',
  'Haemophilus',
  'influenzae',
  'meningitis'],
 'treatment': [],
 'fenfluramine': ['cardiac', 'disease'],
 '-': [],
 'phentermine': ['cardiac', 'disease'],
 'pylori': [],
 'intrauterine': ['preeclampsia', '(', 'proteinuric', 'hypertension', ')'],
 'insemination': ['preeclampsia', '(', 'proteinuric', 'hypertension', ')'],
 'with': ['chronic', 'hepatitis', 'C'],
 'donor': ['preeclampsia', '(', 'proteinuric', 'hypertension', ')'],
 'sperm': [],
 'versus': ['preeclampsia', '(', 'protein

In [226]:
diseases_list

{"'s",
 '(',
 ')',
 ',',
 '-',
 '3',
 'A2',
 'AIDS',
 'Acute',
 'Alzheimer',
 'Blastocystis',
 'C',
 'Cancer',
 'Chlamydia',
 'Contemporary',
 'Crohn',
 'Cushing',
 'Darier',
 'Eisenmenger',
 'Gorlin',
 'Gynaecological',
 'HIV',
 'Haemophilus',
 'IIB',
 'IVB',
 'Interferon',
 'Intramedullary',
 'Johne',
 'Kaposi',
 'Metastatic',
 'Myoepithelial',
 'Ovarian',
 'Ovine',
 'PPH',
 'Parkinson',
 'Pendred',
 'Progressive',
 'Q',
 'RSV',
 'Spontaneous',
 'TNM',
 'a',
 'abdominal',
 'accompanying',
 'achalasia',
 'acute',
 'adenocarcinoma',
 'administration',
 'adrenal',
 'adrenocorticotropic',
 'advanced',
 'against',
 'amyloid',
 'and',
 'anemia',
 'angina',
 'anomalies',
 'anterior',
 'aplasia',
 'arthritis',
 'as',
 'associated',
 'asthma',
 'atrial',
 'attention',
 'attributable',
 'autoimmune',
 'autologous',
 'bacterial',
 'bilateral',
 'biliary',
 'bladder',
 'bowel',
 'brain',
 'breast',
 'bronchial',
 'bronchiolitis',
 'bronchogenic',
 'cancer',
 'carbon',
 'carcinoma',
 'cardiac',
 

# Predicts the treatment for the disease named 'hereditary retinoblastoma'

In [None]:
## disease not found in the list, my bad.