In [206]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
import json
import re
import glob

### Class to preprocess text

In [94]:
class Preprocess_text:
    """Clean a list of text lines and keep the result in ``self.preprocessed``.

    The stages run in this order: punctuation removal, stopword removal,
    lowercasing, lemmatisation. Every stage takes a list of strings and
    returns a new list of the same length.
    """

    def __init__(self, text_list) -> None:
        without_punc = self.remove_punc(text_list)
        without_stopwords = self.remove_stopwords(without_punc)
        lowered = self.to_lowercase(without_stopwords)
        self.preprocessed = self.get_lemmatized(lowered)

    # REMOVE PUNCTUATION
    def remove_punc(self, body_texts):
        """Delete punctuation from each sentence, keeping '.', '/', '-' and ':'.

        Args:
            body_texts: iterable of strings.

        Returns:
            A list with the stripped version of every input string.
        """
        kept = './-:'
        removable = ''.join(
            char for char in string.punctuation if char not in kept)
        # One deletion table is enough; it is the same for every sentence.
        table = str.maketrans('', '', removable)
        return [sentence.translate(table) for sentence in body_texts]

    # REMOVE STOPWORDS
    def remove_stopwords(self, text_list):
        """Tokenize each sentence and drop English stopwords.

        Tokens are compared to the stopword list in lowercase, but the
        surviving tokens keep their original case. The remaining tokens are
        re-joined with single spaces.
        """
        tokenized = [word_tokenize(sentence) for sentence in text_list]
        english_stopwords = set(stopwords.words('english'))
        return [
            " ".join(
                token for token in tokens
                if token.lower() not in english_stopwords)
            for tokens in tokenized
        ]

    # CONVERT TO LOWERCASE
    def to_lowercase(self, text_list):
        """Return a list with every sentence lowercased."""
        return [sentence.lower() for sentence in text_list]

    # LEMMATISATION
    def get_lemmatized(self, text_list):
        """Tokenize each sentence and lemmatize every token with WordNet.

        The lemmatized tokens are re-joined with single spaces.
        """
        lemmatizer = WordNetLemmatizer()
        return [
            " ".join(map(lemmatizer.lemmatize, word_tokenize(sentence)))
            for sentence in text_list
        ]

### Function definitions

In [213]:
def read_file(file_name):
    '''
    Read an OCR-obtained .txt file and return its non-blank lines.

    Each line is stripped of leading and trailing whitespace; lines that are
    empty after stripping are left out. The original order is preserved.
    '''
    with open(file_name, 'r') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        # An empty string is falsy, so this keeps only non-blank lines.
        return [text for text in stripped_lines if text]

def load_x1(file_path):
    '''
    Read the X1.json file and build a dictionary whose keys are the
    lowercased abbreviations and whose values are sets of lowercased synonyms.

    The file is expected to hold a list of objects, each with an
    "Abbreviation" string and a "Synonyms" list of strings. If an
    abbreviation occurs more than once, the first occurrence wins.

    Raises whatever open() / json.load() raise for a missing or malformed
    file, and KeyError if an entry lacks one of the two fields.
    '''
    # JSON text is UTF-8 by specification. Relying on the platform default
    # encoding breaks non-ASCII synonyms on systems whose locale is not UTF-8.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    parameter_dictionary = {}

    for item in data:
        abb = item["Abbreviation"].lower()

        # A set makes the membership tests done during validation constant time.
        syns = {each.lower() for each in item["Synonyms"]}

        # setdefault keeps the first entry when an abbreviation is repeated.
        parameter_dictionary.setdefault(abb, syns)

    return parameter_dictionary

def is_valid_parameter(param, parameter_dict):
    '''
    Tell whether `param` is a known parameter according to the dictionary
    built from the X1.json file.

    `param` is valid when it is either a key of `parameter_dict` (an
    abbreviation) or a member of any of its values (the synonym sets).
    Returns True or False.
    '''
    # Cheap check first: is it one of the abbreviations?
    is_abbreviation = param in parameter_dict

    # Otherwise look through the synonyms of every abbreviation.
    return is_abbreviation or any(
        param in synonyms for synonyms in parameter_dict.values())
    
def validate_number(item, splits, index):
    '''
    Check whether the token `item`, located at `splits[index]`, is a usable
    numeric value.

    A token is rejected when:
      * it cannot be parsed by float();
      * it is part of a range, i.e. the previous token ends with "-" or the
        next token starts with "-";
      * it is enclosed in parentheses, i.e. the previous token ends with "("
        and the next token starts with ")".

    A missing neighbour (first or last position) simply imposes no
    constraint, so a number at either end of the line is accepted unless its
    one existing neighbour marks a range.

    Always returns True or False.
    '''
    try:
        float(item)
    except (TypeError, ValueError):
        # Not a number at all.
        return False

    if not 0 <= index < len(splits):
        # The position does not exist, so there is nothing to validate.
        return False

    # Use "" for a neighbour that does not exist; it never matches a check.
    prev_token = splits[index - 1] if index > 0 else ""
    next_token = splits[index + 1] if index < len(splits) - 1 else ""

    # Number wrapped in parentheses, e.g. "( 12 )".
    if prev_token.endswith("(") and next_token.startswith(")"):
        return False

    # Number that is one end of a range, e.g. "3.5 - 5.5".
    if prev_token.endswith("-") or next_token.startswith("-"):
        return False

    return True

def extract_data(lines, parameter_dict):
    '''
    Extract structured records from unstructured text lines.

    A line produces a record only when its first whitespace-separated token
    is accepted by is_valid_parameter() and at least one of its tokens is
    accepted by validate_number().

    Returns a list of dictionaries with the keys "parameter", "value" and
    "unit":
      * "parameter" is the first token of the line;
      * "value" is the right-most token that validates as a number;
      * "unit" is the right-most token containing '/', or "" if none does.
    '''
    records = []
    for line in lines:
        tokens = line.split()

        # Skip empty lines and lines that do not start with a known parameter.
        if not tokens or not is_valid_parameter(tokens[0], parameter_dict):
            continue

        # Walk the tokens from right to left so the latest value is chosen.
        value = next(
            (tokens[pos]
             for pos in range(len(tokens) - 1, -1, -1)
             if validate_number(tokens[pos], tokens, pos)),
            "",
        )

        # The last token that contains a '/' is taken as the unit.
        unit = next((token for token in reversed(tokens) if '/' in token), "")

        # Lines without a usable value are dropped.
        if value != "":
            records.append({"parameter": tokens[0],
                            "value": value,
                            "unit": unit})
    return records

def make_structured(file_name, x1_path="X1.json"):
    '''
    Turn one OCR text file into a list of structured records.

    file_name: path of the .txt file to process.
    x1_path:   path of the JSON file with abbreviations and synonyms.
               Defaults to "X1.json" in the current working directory, so
               existing single-argument calls behave as before.

    Returns the list of {"parameter", "value", "unit"} dictionaries produced
    by extract_data().
    '''
    # Obtain the non-blank lines of the file
    lines = read_file(file_name)

    # Retrieve the parameter dictionary from the X1 JSON file
    param_dict = load_x1(x1_path)

    # Perform preprocessing like lowercasing, punctuation and stopword removal etc.
    preprocessed_lines = Preprocess_text(lines).preprocessed

    # Obtain list of dictionaries
    return extract_data(preprocessed_lines, param_dict)
                    
                

### Test approach for all text files

In [214]:
# glob returns matches in arbitrary, OS-dependent order. Sorting makes
# positional indexes into lod_list refer to the same file on every run.
txt_files = sorted(glob.glob("*.txt"))

# lod_list[i] holds the records extracted from txt_files[i].
lod_list = []
for f in txt_files:
    lod_list.append(make_structured(f))

In [215]:
# Show the records extracted from the second file in txt_files.
lod_list[1]

[{'parameter': 'rbc', 'value': '4.48', 'unit': '5.50x10l4/l'},
 {'parameter': 'hct', 'value': '0.43', 'unit': ''},
 {'parameter': 'mcv', 'value': '91', 'unit': ''},
 {'parameter': 'mch', 'value': '28.1', 'unit': ''},
 {'parameter': 'mchc', 'value': '309', 'unit': 'g/l'},
 {'parameter': 'rdw', 'value': '12.9', 'unit': ''},
 {'parameter': 'esr', 'value': '21', 'unit': '/h'},
 {'parameter': 'sodium', 'value': '138', 'unit': '/l'},
 {'parameter': 'potassium', 'value': '4.5', 'unit': '/l'},
 {'parameter': 'chloride', 'value': '105', 'unit': '/l'},
 {'parameter': 'bicarbonate', 'value': '24', 'unit': '/l'},
 {'parameter': 'urea', 'value': '4.2', 'unit': '/l'},
 {'parameter': 'creatinine', 'value': '45', 'unit': '/l'},
 {'parameter': 'egfr', 'value': '59', 'unit': 'ml/min/1.73m2'},
 {'parameter': 'albumin', 'value': '37', 'unit': 'g/l'},
 {'parameter': 'alp', 'value': '68', 'unit': 'u/l'},
 {'parameter': 'ggt', 'value': '14', 'unit': 'u/l'},
 {'parameter': 'ast', 'value': '41', 'unit': 'u/l'}