In [138]:
# Import the neccesary libraries

import pandas as pd
import re
from tqdm import tqdm

In [139]:
# Path to the CSV data file - mounted from my Google Drive

data_path = "/content/drive/MyDrive/Task_data (1) (3).csv"

In [140]:
# Function to convert the CSV file to a dataframe. It takes the path to the data file as parameter and returns a dataframe of the data

def csv_to_df(path):
  datadf = pd.read_csv(path, low_memory=False)
  return datadf

In [141]:
# Create stop words from fundamental English stop words
# The stopwords.txt file is a collection of English words that should be eliminated because they do not add any important meaning

stopwords = set(w.rstrip() for w in open('/content/drive/MyDrive/stopwords.txt'))

# Adding more stopwords specific to this problem - as seen in the raw data
# This would be iterately expanded to optimize the performance of the tokenizer

stopwords = stopwords.union({'wa','reported'})

In [142]:
# This function takes a text description as a parameter and returns three lists which categorise the content of the text description into device
# patient, and any other one

def extract_keyword_corpus_df(text_desr):

  device_problem = []
  patient_problem = []
  other_problem = []
  
  global final_device_problem_keyphrases, final_patient_problem_keyphrases, final_other_problem_keyphrases
  
  text_list = text_desr.split('.')
  
  for text in text_list:
    if bool(re.search("device", text, re.IGNORECASE)):
      device_problem.append(text)
    
    elif bool(re.search("patient", text, re.IGNORECASE)):
      patient_problem.append(text)
      
    else:
      other_problem.append(text)
  
  if len(device_problem) > 0:
    device_problem_keyphrases = [d_problem.split('device')[1] for d_problem in device_problem]
    device_problem_keyphrases = [device_problem_keyphrases.split(',') for device_problem_keyphrases in device_problem_keyphrases]
    final_device_problem_keyphrases = [item.lstrip() for sublist in device_problem_keyphrases for item in sublist]
  
  elif len(device_problem) == 0:
    final_device_problem_keyphrases = device_problem
  
  if len(patient_problem) > 0:
    patient_problem_keyphrases = [p_problem.split('patient')[1] for p_problem in patient_problem]
    patient_problem_keyphrases = [patient_problem_keyphrases.split(',') for patient_problem_keyphrases in patient_problem_keyphrases]
    final_patient_problem_keyphrases = [item.lstrip() for sublist in patient_problem_keyphrases for item in sublist]
  
  elif len(patient_problem) == 0:
    final_patient_problem_keyphrases = patient_problem
  
  if len(other_problem) > 0:
    other_problem_keyphrases = [o_problem.split(',') for o_problem in other_problem]
    final_other_problem_keyphrases = [item.lstrip() for sublist in other_problem_keyphrases for item in sublist]
  
  elif len(other_problem) == 0:
    final_other_problem_keyphrases = other_problem
  
  return final_device_problem_keyphrases,  final_patient_problem_keyphrases, final_other_problem_keyphrases

In [143]:
# A tokenizer function, but modified to exclude Lemmatizer and NLTK tokenizer. Instead, split the sentence by white space to get each word of a 
# sentence as token. It also does not include the digits

def modified_tokenizer(sentence):
  
  eliminate_words = [',',"'",'.','(',')','{','}',':','e.g',';', '@', '{}', '()'] # This can also be increased to 
  sentence = sentence.lower() # downcase
  tokens = [t for t in sentence.split(" ")]
  tokens = [t for t in tokens if len(t) > 2 or len(t)==1]   # remove short words, they're probably not useful
  tokens = [t for t in tokens if t not in eliminate_words] # Remove those weird characters in the eliminate_words list
  tokens = [t for t in tokens if t not in stopwords] # remove stopwords
  #digits = [t for t in tokens if any(c.isdigit() for c in t)] # get the digits seperately (probably these are the scores)
  
  return " ".join(tokens)

In [144]:
# The n-gram function

def get_ngrams(content, n):
  n_gram_output = []
  
  for i in range(len(content)-n+1):
    n_gram_output.append(content[i:i+n])
  
  return n_gram_output

In [146]:
# Extract two or three-word sentences

def extract_three_or_two_sen(three_grams):
    three_words = []
    for ele_ in three_grams:
        for ele in ele_:
            x = ele.split(' ')
            if len(x) == 3 or len(x) == 2:
                three_words.append(ele)
    return three_words

In [147]:
# This function is to give us a corpus of all device, patient and other keywords from the entire dataset

def process_keywords_corpus(df):
  
  keyword_corpus_device_df = []
  keyword_corpus_patient_df = []
  keyword_corpus_other_df = []

  for text in df["TEXT DESCRIPTION"]:
    
    keyword_corpus_df = extract_keyword_corpus_df(text) #The extract_keyword_corpus function is called here
    
    if len(keyword_corpus_df[0]) > 0 and keyword_corpus_df[0][0] != "":
      for sen_d in keyword_corpus_df[0]:
        modified_send_d = modified_tokenizer(sen_d)
        keyword_corpus_device_df.append(modified_send_d)
    
    if len(keyword_corpus_df[1]) > 0 and keyword_corpus_df[1][0] != "":
      for sen_p in keyword_corpus_df[1]:
        modified_send_p = modified_tokenizer(sen_p)
        keyword_corpus_patient_df.append(modified_send_p)

    if len(keyword_corpus_df[2]) > 0 and keyword_corpus_df[2][0] != "":
      for sen_o in keyword_corpus_df[2]:
        modified_send_o = modified_tokenizer(sen_o)
        keyword_corpus_other_df.append(modified_send_o)

  n = 3

  three_two_sen_device = set(extract_three_or_two_sen(get_ngrams(keyword_corpus_device_df, n))) # Function chaining here: extract_three_or_two_sen and get_ngrams functions, with the
                                                                                                # respective corpus as parameter
  three_two_sen_patient = set(extract_three_or_two_sen(get_ngrams(keyword_corpus_patient_df, n)))
  three_two_sen_other = set(extract_three_or_two_sen(get_ngrams(keyword_corpus_other_df, n)))


  return three_two_sen_device, three_two_sen_patient, three_two_sen_other

In [148]:
# Global variables for the entire script

df = csv_to_df(data_path) # The dataframe is opened
keywords_corpus = process_keywords_corpus(df) # The entire corpus from the whole dataset
keywords_corpus_device = keywords_corpus[0] # The corpus for device keywords
keywords_corpus_patient = keywords_corpus[1] # The corpus for patient keywords
keywords_corpus_other = keywords_corpus[2] # The corpus for other keywords

In [152]:
# The function processes each text in the text description. It takes a text string as a parameter and return a dictionary

def process_keywordsdict_per_text(text):

  keywords_dict = {}
  
  device_keywords = []
  device_score = [] # Just initialized, but not populated
  patient_keywords = []
  patient_score = [] # Just initialized, but not populated
  other_keywords = []
  other_score = [] # Just initialized, but not populated

  keywords_per_text = extract_keyword_corpus_df(text) # The extract_keyword_corpus function is called here. The same function for the keyword corpus. 
                                                      # This time, however, it is called per text
  keywords_device = keywords_per_text[0]
  keywords_patient = keywords_per_text[1]
  keywords_other = keywords_per_text[2]

  if len(keywords_device) > 0:
    for sent_d in keywords_device:
      sentence_tokens_d = modified_tokenizer(sent_d) # The modified tokenizer function is called here
      
      if sentence_tokens_d in keywords_corpus_device: # To stick to two or three words
        device_keywords.append(sentence_tokens_d)

  if len(keywords_patient) > 0:
    for sent_p in keywords_patient:
      sentence_tokens_p = modified_tokenizer(sent_p) # The modified tokenizer function is called here
      
      if sentence_tokens_p in keywords_corpus_patient: # To stick to two or three words
        patient_keywords.append(sentence_tokens_p)
  
  if len(keywords_other) > 0:
    for sent_o in keywords_other:
      sentence_tokens_o = modified_tokenizer(sent_o) # The modified tokenizer function is called here
      
      if sentence_tokens_o in keywords_corpus_other: # To stick to two or three words
        other_keywords.append(sentence_tokens_o)

  keywords_dict["keywords (Device Problems)"] = device_keywords
  keywords_dict["keywords (Device Problems)_score"] = device_score
  keywords_dict["keywords (Patient Problems)"] = patient_keywords
  keywords_dict["keywords (Patient Problems)_score"] = patient_score
  keywords_dict["keywords others (interesting ones)"] = other_keywords
  keywords_dict["keywords others (interesting ones)_score"] = other_score

  return keywords_dict

In [155]:
#Function to process the data frame as required

def process_df(datadf):

  # row index starts from zero
  idx = 0

  while idx <= len(datadf)-1:

    for text in tqdm(datadf["TEXT DESCRIPTION"]):


      keywords_to_use = process_keywordsdict_per_text(text) # The extract_keywords_dict function is called here
      
      try:
        datadf.iloc[[idx],[3]] = keywords_to_use.get("keywords (Device Problems)", "Not available") # The value of the key from the dict is used to update the corresponding column
        datadf.iloc[[idx], [4]] = keywords_to_use.get("keywords (Device Problems)_score", "Not available")
        datadf.iloc[[idx], [5]] = keywords_to_use.get("keywords (Patient Problems)", "Not available")
        datadf.iloc[[idx], [6]] = keywords_to_use.get("keywords (Patient Problems)_score", "Not available")
        datadf.iloc[[idx], [7]] = keywords_to_use.get("keywords others (interesting ones)", "Not available")
        datadf.iloc[[idx], [8]] = keywords_to_use.get("keywords others (interesting ones)_score", "Not available")

      except ValueError: # This ValueError is exempted because we are passing lists of different lengths, as they contain different num of items to update the dataframe
        continue

    idx += 1

  return datadf

In [None]:
# The tqdm library for progress tracking and multiprocessing optimization
# The process_df function is called here

processed_df = process_df(df)

100%|██████████| 13868/13868 [00:06<00:00, 2307.07it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2313.10it/s]
100%|██████████| 13868/13868 [00:06<00:00, 2229.63it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2336.65it/s]
100%|██████████| 13868/13868 [00:06<00:00, 2276.26it/s]
100%|██████████| 13868/13868 [00:06<00:00, 2271.51it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2315.86it/s]
100%|██████████| 13868/13868 [00:06<00:00, 2227.00it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2316.46it/s]
100%|██████████| 13868/13868 [00:07<00:00, 1791.00it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2336.50it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2358.58it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2338.31it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2332.29it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2336.77it/s]
100%|██████████| 13868/13868 [00:06<00:00, 2304.63it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2324.36it/s]
100%|██████████| 13868/13868 [00:05<00:00, 2322.

In [None]:
# Show us the first 20 rows of the processed dataframe

processed_df.head(20)

In [None]:
# Save the processed data frame in CSV format - change the file name not to override the unprocessed data - for the whole dataframe

def save_processeddf(processed_df):
  processed_df.to_csv("/content/drive/MyDrive/processed_Task_data (1) (3).csv", encoding='utf-8', index=None)

In [None]:
# Call the save_processeddf function

save_processeddf(processed_df)

In [None]:
#Test script here, using the same texts for solutions 2 and 3 to see the quality of the processed data returned. This processed data 
# is expected to be used to populate the dataframe's columns requested

In [None]:
text_1 = '(b)(4). this reported event and subsequent repairs were investigated through the service repair process. failure data and parts-used information were reviewed for the sap and track wise files and found relevant to the service repair. a review of the device service history record was performed from the date of manufacture to the date corresponding to this service notification number. the database showed no quality notifications were opened for the device. a review of the device history record in sap for sn (b)(4) was performed from the date of the manufacture to date of the release of product, which confirmed that this device was not involved in a production failure, and product was returned for servicing which correlates to the customer reported issue. a trackwise complaint history review was completed, and it was confirmed that there were additional complaints received with similar sn (b)(4) for the same or related failure mode. the customer stated that there was no patient involvement.'

In [None]:
text_2 = "the patient contacted animas alleging that the subject pump auto suspends when he wakes up in the morning. there was no mention of physical damage to the pump. there was no reported patient impact associated with this complaint. (b)(4): all buttons responded to presses normally. the keypad was removed and no damage was found to the button contacts. unrelated to the complaint, evaluation revealed the internal clock battery on the pcb board had failed. the pump would not retain the user programmed date and time settings upon removal of the primary aa battery. when a new aa battery is inserted the pump displays the default date and time which must be manually confirmed (or reset) by the user in order to proceed. the pump has not been returned to animas for evaluation. animas has conducted a review of the device history record for this pump and confirmed that it was operating within required specifications at the time of release. if the device is returned, an evaluation shall be completed and a supplemental report will be filed. no conclusions can be made at this time. (b)(4)."

In [None]:
text_3 = "it was reported that the unit was knocked over and subsequently declared multiple errors indicative of a serial peripheral interface failure. there was no patient involvement. the manufacturer's remote service technician performed troubleshooting with the customer. the customer reported that a wire had broken from the battery and the wire was soldered back together. the technician advised the customer against re-soldering components and to replace the battery if damaged. the technician also recommended that the customer replace the data acquisition (da) ribbon cable, followed by the flow sensor and central processing unit (cpu) board. date of event: (b)(6) 2020. date of report: 20apr2020."

In [None]:
text_4 = "the service report shows the customer reported that the 840 ventilator stopped cycling while in use on a pt. the pt was not harmed or injured as a result of the event. the nellcor puritan bennett customer support engineer (cse) inspected the device and could not duplicate the alleged event. the unit passed extended self-testing and no parts were replaced. it is not verified that the vent was inoperable, and that a malfunction occurred."

In [None]:
text_5 = "the customer reported that the vela ventilator alarmed low pip (peak inspiratory pressure), low ve (minute ventilation) , xdcr (transducer) fault. the customer confirmed that there was no patient involvement associated with the reported event. vyaire medical file identification: (b)(4). the customer reported that they will not return the defective pcb for evaluation. therefore, no root cause could be determined . vyaire medical will submit a supplemental report in accordance with 21 cfr section 803.56 if additional information was received."

In [None]:
process_keywordsdict_per_text(text_1)

In [None]:
process_keywordsdict_per_text(text_2)

In [None]:
process_keywordsdict_per_text(text_3)

In [None]:
process_keywordsdict_per_text(text_4)

In [None]:
process_keywordsdict_per_text(text_5)