# Task 1: Create a Prescription Parser using CRF
This task tests your ability to build a Doctor Prescription Parser with the help of CRF model

Your job is to build a Prescription Parser that takes a prescription (sentence) as an input and find / label the words in that sentence with one of the already pre-defined labels

### Problem: SEQUENCE PREDICTION - Label words in a sentence
#### Input : Doctor Prescription in the form of a sentence split into tokens
- Ex: Take 2 tablets once a day for 10 days

#### Output : FHIR Labels
- ('Take', 'Method')
- ('2', 'Qty') 
- ('tablets', 'Form')
- ('once', 'Frequency')
- ('a', 'Period') 
- ('day', 'PeriodUnit')
- ('for', 'FOR')
- ('10', 'Duration')
- ('days', 'DurationUnit') 

### Major Steps
- Install necessary library
- Import the libraries
- Create training data with labels
    - Split the sentence into tokens
    - Compute POS tags
    - Create triples
- Extract features
- Split the data into training and testing set
- Create CRF model
- Save the CRF model
- Load the CRF model
- Predict on test data
- Accuracy

#### Install necesaary library

In [1]:
# !pip install python-crfsuite
# !pip install sklearn_crfsuite

#### Import the necessary libraries

In [2]:
# Import libraries
import nltk
import sklearn_crfsuite
from sklearn.metrics import make_scorer, confusion_matrix, f1_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from nltk.tag import pos_tag
from pprint import pprint
import string
from sklearn.model_selection import train_test_split
import pycrfsuite
from sklearn.metrics import classification_report
from itertools import chain

### Input data (GIVEN)
#### Creating the inputs to the ML model in the following form:
- sigs --> ['take 3 tabs for 10 days']       INPUT SIG
- input_sigs --> [['take', '3', 'tabs', 'for', '10', 'days']]      TOKENS
- output_labels --> [['Method','Qty', 'Form', 'FOR', 'Duration', 'DurationUnit']]       LABELS

In [3]:
sigs = ["for 5 to 6 days", "inject 2 units", "x 2 weeks", "x 3 days", "every day", "every 2 weeks", "every 3 days", "every 1 to 2 months", "every 2 to 6 weeks", "every 4 to 6 days", "take two to four tabs", "take 2 to 4 tabs", "take 3 tabs orally bid for 10 days at bedtime", "swallow three capsules tid orally", "take 2 capsules po every 6 hours", "take 2 tabs po for 10 days", "take 100 caps by mouth tid for 10 weeks", "take 2 tabs after an hour", "2 tabs every 4-6 hours", "every 4 to 6 hours", "q46h", "q4-6h", "2 hours before breakfast", "before 30 mins at bedtime", "30 mins before bed", "and 100 tabs twice a month", "100 tabs twice a month", "100 tabs once a month", "100 tabs thrice a month", "3 tabs daily for 3 days then 1 tab per day at bed", "30 tabs 10 days tid", "take 30 tabs for 10 days three times a day", "qid q6h", "bid", "qid", "30 tabs before dinner and bedtime", "30 tabs before dinner & bedtime", "take 3 tabs at bedtime", "30 tabs thrice daily for 10 days ", "30 tabs for 10 days three times a day", "Take 2 tablets a day", "qid for 10 days", "every day", "take 2 caps at bedtime", "apply 3 drops before bedtime", "take three capsules daily", "swallow 3 pills once a day", "swallow three pills thrice a day", "apply daily", "apply three drops before bedtime", "every 6 hours", "before food", "after food", "for 20 days", "for twenty days", "with meals"]
input_sigs = [['for', '5', 'to', '6', 'days'], ['inject', '2', 'units'], ['x', '2', 'weeks'], ['x', '3', 'days'], ['every', 'day'], ['every', '2', 'weeks'], ['every', '3', 'days'], ['every', '1', 'to', '2', 'months'], ['every', '2', 'to', '6', 'weeks'], ['every', '4', 'to', '6', 'days'], ['take', 'two', 'to', 'four', 'tabs'], ['take', '2', 'to', '4', 'tabs'], ['take', '3', 'tabs', 'orally', 'bid', 'for', '10', 'days', 'at', 'bedtime'], ['swallow', 'three', 'capsules', 'tid', 'orally'], ['take', '2', 'capsules', 'po', 'every', '6', 'hours'], ['take', '2', 'tabs', 'po', 'for', '10', 'days'], ['take', '100', 'caps', 'by', 'mouth', 'tid', 'for', '10', 'weeks'], ['take', '2', 'tabs', 'after', 'an', 'hour'], ['2', 'tabs', 'every', '4-6', 'hours'], ['every', '4', 'to', '6', 'hours'], ['q46h'], ['q4-6h'], ['2', 'hours', 'before', 'breakfast'], ['before', '30', 'mins', 'at', 'bedtime'], ['30', 'mins', 'before', 'bed'], ['and', '100', 'tabs', 'twice', 'a', 'month'], ['100', 'tabs', 'twice', 'a', 'month'], ['100', 'tabs', 'once', 'a', 'month'], ['100', 'tabs', 'thrice', 'a', 'month'], ['3', 'tabs', 'daily', 'for', '3', 'days', 'then', '1', 'tab', 'per', 'day', 'at', 'bed'], ['30', 'tabs', '10', 'days', 'tid'], ['take', '30', 'tabs', 'for', '10', 'days', 'three', 'times', 'a', 'day'], ['qid', 'q6h'], ['bid'], ['qid'], ['30', 'tabs', 'before', 'dinner', 'and', 'bedtime'], ['30', 'tabs', 'before', 'dinner', '&', 'bedtime'], ['take', '3', 'tabs', 'at', 'bedtime'], ['30', 'tabs', 'thrice', 'daily', 'for', '10', 'days'], ['30', 'tabs', 'for', '10', 'days', 'three', 'times', 'a', 'day'], ['take', '2', 'tablets', 'a', 'day'], ['qid', 'for', '10', 'days'], ['every', 'day'], ['take', '2', 'caps', 'at', 'bedtime'], ['apply', '3', 'drops', 'before', 'bedtime'], ['take', 'three', 'capsules', 'daily'], ['swallow', '3', 'pills', 'once', 'a', 'day'], ['swallow', 'three', 'pills', 'thrice', 'a', 'day'], ['apply', 'daily'], ['apply', 'three', 'drops', 'before', 'bedtime'], ['every', '6', 'hours'], ['before', 'food'], ['after', 'food'], ['for', '20', 'days'], ['for', 'twenty', 'days'], ['with', 'meals']]
output_labels = [['FOR', 'Duration', 'TO', 'DurationMax', 'DurationUnit'], ['Method', 'Qty', 'Form'], ['FOR', 'Duration', 'DurationUnit'], ['FOR', 'Duration', 'DurationUnit'], ['EVERY', 'Period'], ['EVERY', 'Period', 'PeriodUnit'], ['EVERY', 'Period', 'PeriodUnit'], ['EVERY', 'Period', 'TO', 'PeriodMax', 'PeriodUnit'], ['EVERY', 'Period', 'TO', 'PeriodMax', 'PeriodUnit'], ['EVERY', 'Period', 'TO', 'PeriodMax', 'PeriodUnit'], ['Method', 'Qty', 'TO', 'Qty', 'Form'], ['Method', 'Qty', 'TO', 'Qty', 'Form'], ['Method', 'Qty', 'Form', 'PO', 'BID', 'FOR', 'Duration', 'DurationUnit', 'AT', 'WHEN'], ['Method', 'Qty', 'Form', 'TID', 'PO'], ['Method', 'Qty', 'Form', 'PO', 'EVERY', 'Period', 'PeriodUnit'], ['Method', 'Qty', 'Form', 'PO', 'FOR', 'Duration', 'DurationUnit'], ['Method', 'Qty', 'Form', 'BY', 'PO', 'TID', 'FOR', 'Duration', 'DurationUnit'], ['Method', 'Qty', 'Form', 'AFTER', 'Period', 'PeriodUnit'], ['Qty', 'Form', 'EVERY', 'Period', 'PeriodUnit'], ['EVERY', 'Period', 'TO', 'PeriodMax', 'PeriodUnit'], ['Q46H'], ['Q4-6H'], ['Qty', 'PeriodUnit', 'BEFORE', 'WHEN'], ['BEFORE', 'Qty', 'M', 'AT', 'WHEN'], ['Qty', 'M', 'BEFORE', 'WHEN'], ['AND', 'Qty', 'Form', 'Frequency', 'Period', 'PeriodUnit'], ['Qty', 'Form', 'Frequency', 'Period', 'PeriodUnit'], ['Qty', 'Form', 'Frequency', 'Period', 'PeriodUnit'], ['Qty', 'Form', 'Frequency', 'Period', 'PeriodUnit'], ['Qty', 'Form', 'Frequency', 'FOR', 'Duration', 'DurationUnit', 'THEN', 'Qty', 'Form', 'Frequency', 'PeriodUnit', 'AT', 'WHEN'], ['Qty', 'Form', 'Duration', 'DurationUnit', 'TID'], ['Method', 'Qty', 'Form', 'FOR', 'Duration', 'DurationUnit', 'Qty', 'TIMES', 'Period', 'PeriodUnit'], ['QID', 'Q6H'], ['BID'], ['QID'],['Qty', 'Form', 'BEFORE', 'WHEN', 'AND', 'WHEN'], ['Qty', 'Form', 'BEFORE', 'WHEN', 'AND', 'WHEN'], ['Method', 'Qty', 'Form', 'AT', 'WHEN'], ['Qty', 'Form', 'Frequency', 'DAILY', 'FOR', 'Duration', 'DurationUnit'], ['Qty', 'Form', 'FOR', 'Duration', 'DurationUnit', 'Frequency', 'TIMES', 'Period', 'PeriodUnit'], ['Method', 'Qty', 'Form', 'Period', 'PeriodUnit'], ['QID', 'FOR', 'Duration', 'DurationUnit'], ['EVERY', 'PeriodUnit'], ['Method', 'Qty', 'Form', 'AT', 'WHEN'], ['Method', 'Qty', 'Form', 'BEFORE', 'WHEN'], ['Method', 'Qty', 'Form', 'DAILY'], ['Method', 'Qty', 'Form', 'Frequency', 'Period', 'PeriodUnit'], ['Method', 'Qty', 'Form', 'Frequency', 'Period', 'PeriodUnit'], ['Method', 'DAILY'], ['Method', 'Qty', 'Form', 'BEFORE', 'WHEN'], ['EVERY', 'Period', 'PeriodUnit'], ['BEFORE', 'FOOD'], ['AFTER', 'FOOD'], ['FOR', 'Duration', 'DurationUnit'], ['FOR', 'Duration', 'DurationUnit'], ['WITH', 'FOOD']]

In [4]:
len(sigs), len(input_sigs) , len(output_labels)

(56, 56, 56)

### Creating a Tuples Maker method
Create the tuples as given below by writing a function **tuples_maker(input_sigs, output_labels)** and returns **output** as given below

Input(s): 
- input_sigs
- output_lables

Output:

[[('for', 'FOR'),
  ('5', 'Duration'),
  ('to', 'TO'),
  ('6', 'DurationMax'),
  ('days', 'DurationUnit')], [second sentence], ...]

In [5]:
def tuples_maker(input_sigs, output_labels):
    sample_data = []
    for tokens, labels in zip(input_sigs, output_labels):
        tuple_data = list(zip(tokens, labels))
        sample_data.append(tuple_data)
    return sample_data

result = tuples_maker(input_sigs, output_labels)

In [6]:
print(result)

[[('for', 'FOR'), ('5', 'Duration'), ('to', 'TO'), ('6', 'DurationMax'), ('days', 'DurationUnit')], [('inject', 'Method'), ('2', 'Qty'), ('units', 'Form')], [('x', 'FOR'), ('2', 'Duration'), ('weeks', 'DurationUnit')], [('x', 'FOR'), ('3', 'Duration'), ('days', 'DurationUnit')], [('every', 'EVERY'), ('day', 'Period')], [('every', 'EVERY'), ('2', 'Period'), ('weeks', 'PeriodUnit')], [('every', 'EVERY'), ('3', 'Period'), ('days', 'PeriodUnit')], [('every', 'EVERY'), ('1', 'Period'), ('to', 'TO'), ('2', 'PeriodMax'), ('months', 'PeriodUnit')], [('every', 'EVERY'), ('2', 'Period'), ('to', 'TO'), ('6', 'PeriodMax'), ('weeks', 'PeriodUnit')], [('every', 'EVERY'), ('4', 'Period'), ('to', 'TO'), ('6', 'PeriodMax'), ('days', 'PeriodUnit')], [('take', 'Method'), ('two', 'Qty'), ('to', 'TO'), ('four', 'Qty'), ('tabs', 'Form')], [('take', 'Method'), ('2', 'Qty'), ('to', 'TO'), ('4', 'Qty'), ('tabs', 'Form')], [('take', 'Method'), ('3', 'Qty'), ('tabs', 'Form'), ('orally', 'PO'), ('bid', 'BID'), ('

### Creating the triples_maker( ) for feature extraction
- input: tuples_maker_output
- output: 
[[('for', 'IN', 'FOR'),
  ('5', 'CD', 'Duration'),
  ('to', 'TO', 'TO'),
  ('6', 'CD', 'DurationMax'),
  ('days', 'NNS', 'DurationUnit')], [second sentence], ... ]

In [7]:
def triples_maker(whole_data):
    sample_data = []
    for sentence_tuples in whole_data:
        triple_data = [(token, None, label) for token, label in sentence_tuples]
        sample_data.append(triple_data)
    return sample_data

# Assuming whole_data is the result from tuples_maker
result = tuples_maker(input_sigs, output_labels)

In [8]:
sample_data = triples_maker(result)
print(sample_data)

[[('for', None, 'FOR'), ('5', None, 'Duration'), ('to', None, 'TO'), ('6', None, 'DurationMax'), ('days', None, 'DurationUnit')], [('inject', None, 'Method'), ('2', None, 'Qty'), ('units', None, 'Form')], [('x', None, 'FOR'), ('2', None, 'Duration'), ('weeks', None, 'DurationUnit')], [('x', None, 'FOR'), ('3', None, 'Duration'), ('days', None, 'DurationUnit')], [('every', None, 'EVERY'), ('day', None, 'Period')], [('every', None, 'EVERY'), ('2', None, 'Period'), ('weeks', None, 'PeriodUnit')], [('every', None, 'EVERY'), ('3', None, 'Period'), ('days', None, 'PeriodUnit')], [('every', None, 'EVERY'), ('1', None, 'Period'), ('to', None, 'TO'), ('2', None, 'PeriodMax'), ('months', None, 'PeriodUnit')], [('every', None, 'EVERY'), ('2', None, 'Period'), ('to', None, 'TO'), ('6', None, 'PeriodMax'), ('weeks', None, 'PeriodUnit')], [('every', None, 'EVERY'), ('4', None, 'Period'), ('to', None, 'TO'), ('6', None, 'PeriodMax'), ('days', None, 'PeriodUnit')], [('take', None, 'Method'), ('two', N

### Creating the features extractor method (GIVEN as a BASELINE)
#### The features used are:
- SOS, EOS, lowercase, uppercase, title, digit, postag, previous_tag, next_tag
#### Feel free to include more features

In [9]:
def token_to_features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Additional features
    features.extend([
        'word.shape=' + get_word_shape(word),
        'word.prefix=' + word[:3],
        'has-hyphen=%s' % ('-' in word),
        'has_apostrophe=%s' % ("'" in word),
        'postag[:2]=' + postag[:2]
    ])

    # Features for words that are not at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2]
        ])
    else:
        features.append('BOS')

    # Features for words that are not at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2]
        ])
    else:
        features.append('EOS')

    return features

# Define the missing get_word_shape function
def get_word_shape(word):
    shape = ''
    for char in word:
        if char.islower():
            shape += 'x'
        elif char.isupper():
            shape += 'X'
        elif char.isdigit():
            shape += 'd'
        else:
            shape += char
    return shape

### Running the feature extractor on the training data 
- Feature extraction
- Train-test-split

In [10]:
# Placeholder function for feature extraction
def extract_features(sentence):
    # This is a simple example; replace it with your actual feature extraction logic
    return [{'token': token, 'is_capitalized': token[0].isupper()} for token in sentence]

# Add the following function to convert labels to a flat list
def flatten_labels(labels):
    return [label for sublist in labels for label in sublist]

# Placeholder function for reading labeled data
def read_labeled_data():
    # This is a simple example; replace it with your actual logic to read labeled data
    # Assume your data is a list of tuples where the first element is the sentence (list of tokens)
    # and the second element is a list of labels for each token
    return zip(input_sigs, output_labels)

# Extract features and labels from your data
X = [extract_features(sentence) for sentence, _ in read_labeled_data()]
y = [labels for _, labels in read_labeled_data()]

### Training the CRF model with the features extracted using the feature extractor method

In [11]:
# # Split the data into training and testing sets (80% training, 20% testing)
# split_idx = int(0.8 * len(X))
# X_train, X_test = X[:split_idx], X[split_idx:]
# y_train, y_test = y[:split_idx], y[split_idx:]

# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the CRF model
trainer = pycrfsuite.Trainer(verbose=True)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function
model_file = 'Prescription_Parser.model'
trainer.train(model_file)

# Print the model training details
print(trainer.logparser.last_iteration)

# Train the model and save it to a file
trainer.train('crf_model.crfsuite')

# Load the trained model
tagger = pycrfsuite.Tagger()
tagger.open('crf_model.crfsuite')

# Make predictions on the test set
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Flatten the true and predicted labels
y_test_flat = flatten_labels(y_test)
y_pred_flat = flatten_labels(y_pred)

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 1008
Seconds required: 0.002

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 661.001241
Feature norm: 1.000000
Error norm: 42.883098
Active features: 60
Line search trials: 1
Line search step: 0.023153
Seconds required for this iteration: 0.001

***** Iteration #2 *****
Loss: 560.047357
Feature norm: 22.443334
Error norm: 64.914320
Active features: 60
Line search trials: 4
Line search step: 0.125000
Seconds required for this iteration: 0.008

***** Iteration #3 *****
Loss: 340.391886
Feature norm: 21.971700
Error norm: 20.767957
Active features: 62
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.002



In [12]:
# Print classification report
print(classification_report(y_test_flat, y_pred_flat))

              precision    recall  f1-score   support

         AND       0.00      0.00      0.00         1
          AT       1.00      1.00      1.00         1
      BEFORE       0.67      1.00      0.80         2
         BID       0.00      0.00      0.00         2
    Duration       1.00      1.00      1.00         4
 DurationMax       0.00      0.00      0.00         1
DurationUnit       1.00      0.75      0.86         4
       EVERY       1.00      1.00      1.00         3
         FOR       1.00      1.00      1.00         4
        Form       1.00      1.00      1.00         5
   Frequency       0.33      1.00      0.50         1
      Method       1.00      1.00      1.00         3
          PO       0.00      0.00      0.00         2
      Period       0.67      1.00      0.80         4
   PeriodMax       0.50      1.00      0.67         1
  PeriodUnit       0.80      1.00      0.89         4
         QID       0.00      0.00      0.00         0
         Qty       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Print the length of ground truth and predicted labels
print("Length of test labels:", len(y_test_flat))
print("Length of predicted labels:", len(y_pred_flat))

Length of test labels: 54
Length of predicted labels: 54


### Predicting the test data with the built model

In [14]:
# Define the test data
test_data = [
    "take 2 tabs every 6 hours x 10 days",
    "2 capsu for 10 day at bed",
    "2 capsu for 10 days at bed",
    "5 days 2 tabs at bed",
    "3 tabs qid x 10 weeks",
    "x 30 days",
    "x 20 months",
    "take 2 tabs po tid for 10 days",
    "take 2 capsules po every 6 hours",
    "inject 2 units pu tid",
    "swallow 3 caps tid by mouth",
    "inject 3 units orally",
    "orally take 3 tabs tid",
    "by mouth take three caps",
    "take 3 tabs orally three times a day for 10 days at bedtime",
    "take 3 tabs orally bid for 10 days at bedtime",
    "take 3 tabs bid orally at bed",
    "take 10 capsules by mouth qid",
    "inject 10 units orally qid x 3 months",
    "please take 2 tablets per day for a month in the morning and evening each day",
    "Amoxcicillin QID 30 tablets",
    "take 3 tabs TID for 90 days with food",
    "with food take 3 tablets per day for 90 days",
    "with food take 3 tablets per week for 90 weeks",
    "take 2-4 tabs",
    "take 2 to 4 tabs",
    "take two to four tabs",
    "take 2-4 tabs for 8 to 9 days",
    "take 20 tabs every 6 to 8 days",
    "take 2 tabs every 4 to 6 days",
    "take 2 tabs every 2 to 10 weeks",
    "take 2 tabs every 4 to 6 days",
    "take 2 tabs every 2 to 10 months",
    "every 60 mins",
    "every 10 mins",
    "every two to four months",
    "take 2 tabs every 3 to 4 days",
    "every 3 to 4 days take 20 tabs",
    "once in every 3 days take 3 tabs",
    "take 3 tabs once in every 3 days",
    "orally take 20 tabs every 4-6 weeks",
    "10 tabs x 2 days",
    "3 capsule x 15 days",
    "10 tabs"
]

# Define the tokenize function
def tokenize(sentence):
    return sentence.split()

# Extract features for the test data
X_test_data = [extract_features(tokenize(sentence)) for sentence in test_data]

# Load the trained CRF model
tagger = pycrfsuite.Tagger()
tagger.open(model_file)  # Use the same name as during training

# Make predictions on the test data
y_pred_test_data = [tagger.tag(xseq) for xseq in X_test_data]

# Print the predictions for inspection
for i, (sentence, pred_labels) in enumerate(zip(test_data, y_pred_test_data)):
    print(f"Sentence {i + 1}:")
    print(f"Original Sentence: {sentence}")
    print(f"Predicted Labels: {pred_labels}")
    print()

Sentence 1:
Original Sentence: take 2 tabs every 6 hours x 10 days
Predicted Labels: ['Method', 'Qty', 'Form', 'EVERY', 'Period', 'PeriodUnit', 'FOR', 'Duration', 'DurationUnit']

Sentence 2:
Original Sentence: 2 capsu for 10 day at bed
Predicted Labels: ['Qty', 'Form', 'FOR', 'Duration', 'DurationUnit', 'AT', 'WHEN']

Sentence 3:
Original Sentence: 2 capsu for 10 days at bed
Predicted Labels: ['Qty', 'Form', 'FOR', 'Duration', 'DurationUnit', 'AT', 'WHEN']

Sentence 4:
Original Sentence: 5 days 2 tabs at bed
Predicted Labels: ['Duration', 'DurationUnit', 'Qty', 'Form', 'AT', 'WHEN']

Sentence 5:
Original Sentence: 3 tabs qid x 10 weeks
Predicted Labels: ['Qty', 'Form', 'QID', 'FOR', 'Duration', 'DurationUnit']

Sentence 6:
Original Sentence: x 30 days
Predicted Labels: ['Method', 'Qty', 'Form']

Sentence 7:
Original Sentence: x 20 months
Predicted Labels: ['Method', 'Qty', 'Form']

Sentence 8:
Original Sentence: take 2 tabs po tid for 10 days
Predicted Labels: ['Method', 'Qty', 'Form'

### Putting all the prediction logic inside a predict method

In [15]:
def predict(sig):
    """
    predict(sig)
    Purpose: Labels the given sig into corresponding labels
    @param sig: A Sentence  # A medical prescription sig written by a doctor
    @return: A list      # A list with predicted labels (first level of labeling)
    """
    # Tokenize the input sentence
    tokens = tokenize(sig)

    # Extract features for the input sentence
    features = extract_features(tokens)

    # Load the trained CRF model
    tagger = pycrfsuite.Tagger()
    tagger.open(model_file)  # Use the same name as during training

    # Make predictions on the input data
    pred_labels = tagger.tag(features)

    # Organize predictions into the required format
    predictions = organize_predictions(tokens, pred_labels)

    return predictions

def organize_predictions(tokens, pred_labels):
    """
    Organize the raw predictions into the required format.
    This function may need to be customized based on your specific label mapping.

    @param tokens: List of tokens in the sentence
    @param pred_labels: Predicted labels for each token
    @return: List with predicted labels (organized format)
    """
    organized_predictions = []

    # Iterate through tokens and predicted labels
    for token, label in zip(tokens, pred_labels):
        # Customize this part based on your specific label mapping
        organized_predictions.append([label, ...])

    return organized_predictions

### Sample predictions

In [16]:
prediction = predict("take 2 tabs every 6 hours x 10 days")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis]]


In [17]:
prediction = predict("2 capsu for 10 day at bed")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis], ['AT', Ellipsis], ['WHEN', Ellipsis]]


In [18]:
prediction = predict("2 capsu for 10 days at bed")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis], ['AT', Ellipsis], ['WHEN', Ellipsis]]


In [19]:
prediction = predict("5 days 2 tabs at bed")
print(prediction)

[['Duration', Ellipsis], ['DurationUnit', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['AT', Ellipsis], ['WHEN', Ellipsis]]


In [20]:
prediction = predict("3 tabs qid x 10 weeks")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis], ['QID', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis]]


In [21]:
prediction = predict("x 30 days")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [22]:
prediction = predict("x 20 months")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [23]:
prediction = predict("take 2 tabs po tid for 10 days")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['PO', Ellipsis], ['TID', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis]]


In [24]:
prediction = predict("take 2 capsules po every 6 hours")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['PO', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis]]


In [25]:
prediction = predict("inject 2 units pu tid")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['TID', Ellipsis]]


In [26]:
prediction = predict("swallow 3 caps tid by mouth")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis]]


In [27]:
prediction = predict("inject 3 units orally")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis]]


In [28]:
prediction = predict("orally take 3 tabs tid")
print(prediction)

[['QID', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['TID', Ellipsis]]


In [29]:
prediction = predict("by mouth take three caps")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [30]:
prediction = predict("take 3 tabs orally three times a day for 10 days at bedtime")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis], ['AT', Ellipsis], ['WHEN', Ellipsis]]


In [31]:
prediction = predict("take 3 tabs orally bid for 10 days at bedtime")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['Period', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis], ['AT', Ellipsis], ['WHEN', Ellipsis]]


In [32]:
prediction = predict("take 3 tabs bid orally at bed")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['Period', Ellipsis], ['AT', Ellipsis], ['WHEN', Ellipsis]]


In [33]:
prediction = predict("take 10 capsules by mouth qid")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis]]


In [34]:
prediction = predict("inject 10 units orally qid x 3 months")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['QID', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [35]:
prediction = predict("please take 2 tablets per day for a month in the morning and evening each day")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['AND', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis]]


In [36]:
prediction = predict("Amoxcicillin QID 30 tablets")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis]]


In [37]:
prediction = predict("take 3 tabs TID for 90 days with food")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [38]:
prediction = predict("with food take 3 tablets per day for 90 days")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis]]


In [39]:
prediction = predict("with food take 3 tablets per week for 90 weeks")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['Period', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis]]


In [40]:
prediction = predict("take 2-4 tabs")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [41]:
prediction = predict("take 2 to 4 tabs")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['TO', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [42]:
prediction = predict("take two to four tabs")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['TO', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [43]:
prediction = predict("take 2-4 tabs for 8 to 9 days")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['TO', Ellipsis], ['PeriodMax', Ellipsis], ['PeriodUnit', Ellipsis]]


In [44]:
prediction = predict("take 20 tabs every 6 to 8 days")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['TO', Ellipsis], ['PeriodMax', Ellipsis], ['PeriodUnit', Ellipsis]]


In [45]:
prediction = predict("take 2 tabs every 4 to 6 days")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['TO', Ellipsis], ['PeriodMax', Ellipsis], ['PeriodUnit', Ellipsis]]


In [46]:
prediction = predict("take 2 tabs every 2 to 10 weeks")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['TO', Ellipsis], ['PeriodMax', Ellipsis], ['PeriodUnit', Ellipsis]]


In [47]:
prediction = predict("take 2 tabs every 4 to 6 days")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['TO', Ellipsis], ['PeriodMax', Ellipsis], ['PeriodUnit', Ellipsis]]


In [48]:
prediction = predict("take 2 tabs every 2 to 10 months")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['TO', Ellipsis], ['PeriodMax', Ellipsis], ['PeriodUnit', Ellipsis]]


In [49]:
prediction = predict("every 60 mins")
print(prediction)

[['EVERY', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis]]


In [50]:
prediction = predict("every 10 mins")
print(prediction)

[['EVERY', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis]]


In [51]:
prediction = predict("every two to four months")
print(prediction)

[['EVERY', Ellipsis], ['Period', Ellipsis], ['TO', Ellipsis], ['PeriodMax', Ellipsis], ['PeriodUnit', Ellipsis]]


In [52]:
prediction = predict("take 2 tabs every 3 to 4 days")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['TO', Ellipsis], ['PeriodMax', Ellipsis], ['PeriodUnit', Ellipsis]]


In [53]:
prediction = predict("every 3 to 4 days take 20 tabs")
print(prediction)

[['EVERY', Ellipsis], ['Period', Ellipsis], ['TO', Ellipsis], ['PeriodMax', Ellipsis], ['PeriodUnit', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [54]:
prediction = predict("once in every 3 days take 3 tabs")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis]]


In [55]:
prediction = predict("take 3 tabs once in every 3 days")
print(prediction)

[['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['Frequency', Ellipsis], ['Period', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis]]


In [56]:
prediction = predict("orally take 20 tabs every 4-6 weeks")
print(prediction)

[['QID', Ellipsis], ['Method', Ellipsis], ['Qty', Ellipsis], ['Form', Ellipsis], ['EVERY', Ellipsis], ['Period', Ellipsis], ['PeriodUnit', Ellipsis]]


In [57]:
prediction = predict("10 tabs x 2 days")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis]]


In [58]:
prediction = predict("3 capsule x 15 days")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis], ['FOR', Ellipsis], ['Duration', Ellipsis], ['DurationUnit', Ellipsis]]


In [59]:
prediction = predict("10 tabs")
print(prediction)

[['Qty', Ellipsis], ['Form', Ellipsis]]
