In [None]:
# Import the necessary libraries

import pandas as pd
import re
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('all')

In [None]:
# Path to the CSV data file, located on a mounted Google Drive.
# NOTE(review): this is a hard-coded absolute path, and no visible cell mounts the drive —
# confirm the mount happens before this notebook is run.

data_path = "/content/drive/MyDrive/Task_data (1) (3).csv"

In [None]:
# Function to convert the CSV file to a dataframe. It takes the path to the data file as parameter and returns a dataframe of the data

def csv_to_df(path):
  """Load the CSV file at `path` into a pandas DataFrame and return it.

  The file is read with ``low_memory=False``.
  """
  return pd.read_csv(path, low_memory=False)

In [None]:
# Sample report narrative (appears identical to text_5 defined in a later cell).
# NOTE(review): this variable is not referenced by any other visible cell.
text_description = "the customer reported that the vela ventilator alarmed low pip (peak inspiratory pressure), low ve (minute ventilation) , xdcr (transducer) fault. the customer confirmed that there was no patient involvement associated with the reported event. vyaire medical file identification: (b)(4). the customer reported that they will not return the defective pcb for evaluation. therefore, no root cause could be determined . vyaire medical will submit a supplemental report in accordance with 21 cfr section 803.56 if additional information was received."

In [None]:
def _keyphrases(sentences, marker=None):
  """Split each sentence into comma-separated key phrases with leading whitespace removed.

  When `marker` is given, only the text that follows the first occurrence of
  `marker` (matched case-insensitively) and precedes the next occurrence is kept
  before splitting on commas.
  """
  phrases = []
  for sentence in sentences:
    if marker is not None:
      # Case-insensitive split so it agrees with the case-insensitive search used to
      # classify the sentence; a plain str.split would raise IndexError on e.g. "Device".
      sentence = re.split(re.escape(marker), sentence, flags=re.IGNORECASE)[1]
    phrases.extend(part.lstrip() for part in sentence.split(','))
  return phrases


def extract_keyword_corpus(text_desr):
  """Split a text description into device, patient and other key phrases.

  The text is split into sentences on '.', and each sentence is put into exactly
  one category: "device" if it contains the word device (any case), otherwise
  "patient" if it contains the word patient (any case), otherwise "other".
  Device/patient sentences are reduced to the text following the category word;
  every sentence is then split on ',' and each piece is left-stripped.

  Parameters:
    text_desr (str): the free-text description to process.

  Returns:
    tuple: (device_keyphrases, patient_keyphrases, other_keyphrases), three lists
    of strings. A list is empty when no sentence fell into that category.
  """
  # Retained for backward compatibility: the three results are also published as
  # module-level variables, as the original implementation did.
  global final_device_problem_keyphrases, final_patient_problem_keyphrases, final_other_problem_keyphrases

  device_problem = []
  patient_problem = []
  other_problem = []

  for text in text_desr.split('.'):
    if re.search("device", text, re.IGNORECASE):
      device_problem.append(text)
    elif re.search("patient", text, re.IGNORECASE):
      patient_problem.append(text)
    else:
      other_problem.append(text)

  final_device_problem_keyphrases = _keyphrases(device_problem, "device")
  final_patient_problem_keyphrases = _keyphrases(patient_problem, "patient")
  final_other_problem_keyphrases = _keyphrases(other_problem)

  return final_device_problem_keyphrases,  final_patient_problem_keyphrases, final_other_problem_keyphrases

In [None]:
# Initialize the WordNetLemmatizer for the NLP processing.
# This single module-level instance is the one my_tokenizer calls to lemmatize tokens.

wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Create the stop-word set from a file of fundamental English stop words.
# Each line of stopwords.txt (with trailing whitespace removed) becomes one entry; these words are
# eliminated by the tokenizer because they do not add any important meaning.
# The file is opened in a `with` block so the handle is closed as soon as the set has been built.

with open('/content/drive/MyDrive/stopwords.txt') as stopwords_file:
  stopwords = set(w.rstrip() for w in stopwords_file)

# Add more stop words specific to this problem - as seen in the raw data.
# This would be iteratively expanded to optimize the performance of the tokenizer.
# ('wa' is presumably the lemmatized form of "was"; my_tokenizer filters stop words after lemmatizing.)

stopwords = stopwords.union({'wa','reported'})

In [None]:
# Tokenization and Text pre-processing function
# This function tokenizes each sentence in the problem list provided, and returns a new list of tokens and any numeric value found 

def my_tokenizer(sentence):
  """Lower-case, tokenize, filter and lemmatize `sentence`.

  Processing order for each token produced by nltk's word_tokenize:
    1. drop tokens that are exactly two characters long (one-character and
       longer tokens are kept),
    2. drop tokens listed in the punctuation/abbreviation tuple below,
    3. lemmatize with the module-level `wordnet_lemmatizer`,
    4. drop lemmas found in the module-level `stopwords` set.

  Returns:
    tuple: (text, digits) where `text` is the surviving lemmas joined with single
    spaces and `digits` is the list of surviving lemmas containing at least one
    digit character.
  """
  unwanted = (',', "'", '.', '(', ')', '{', '}', ':', 'e.g', ';', '@', '{}', '()')

  kept = []
  for raw in nltk.tokenize.word_tokenize(sentence.lower()):
    # Keep only tokens of length 1 or of length 3 and above.
    if len(raw) in (0, 2):
      continue
    if raw in unwanted:
      continue
    lemma = wordnet_lemmatizer.lemmatize(raw)
    if lemma in stopwords:
      continue
    kept.append(lemma)

  # Tokens carrying any digit are additionally reported on their own.
  numeric = [tok for tok in kept if any(ch.isdigit() for ch in tok)]

  return " ".join(kept), numeric

In [None]:
# This function takes the text description as a parameter, passes it to the extract_keyword_corpus function.
# The returned values from the extract_keyword_corpus function are further processed to get a dictionary data structure for the required features
# This function returns a dictionary where the key-values are the six columns to be populated in the dataframe

def _collect_keywords(phrases):
  """Tokenize every phrase and return (keywords, scores).

  `keywords` holds the cleaned text returned by my_tokenizer for each phrase;
  `scores` holds, for each phrase that produced digit tokens, those tokens joined
  with single spaces.
  """
  keywords = []
  scores = []
  for phrase in phrases:
    cleaned_text, digit_tokens = my_tokenizer(phrase) # The tokenizer function is called here
    keywords.append(cleaned_text)
    if digit_tokens:
      scores.append(" ".join(digit_tokens))
  return keywords, scores


def extract_keywords_dict(text_desr):
  """Build the six keyword/score columns for one text description.

  The description is passed to extract_keyword_corpus, and the device, patient
  and other key phrases it returns are each tokenized independently. Previously
  the three categories were chained with if/elif, so only the first non-empty
  category was ever populated; now all three are filled.

  Parameters:
    text_desr (str): the free-text description to process.

  Returns:
    dict: maps each of the six column names ("keywords (Device Problems)",
    "keywords (Patient Problems)", "keywords others (interesting ones)" and the
    same names suffixed with "_score") to a list of strings.
  """
  device_phrases, patient_phrases, other_phrases = extract_keyword_corpus(text_desr) #The extract_keyword_corpus function is called here

  keywords_dict = {}
  for column, phrases in (
      ("keywords (Device Problems)", device_phrases),
      ("keywords (Patient Problems)", patient_phrases),
      ("keywords others (interesting ones)", other_phrases),
  ):
    keywords, scores = _collect_keywords(phrases)
    keywords_dict[column] = keywords
    keywords_dict[column + "_score"] = scores

  return keywords_dict

In [None]:
# Call the csv_to_df function, which reads the CSV file at data_path into a dataframe

data_df = csv_to_df(data_path)


In [None]:
# Call the process_df function on the loaded dataframe
# NOTE(review): process_df is not defined in any cell visible here; unless it is defined elsewhere,
# this cell fails with NameError on a fresh kernel — confirm and define it before this call.

processed_data_df = process_df(data_df)

In [None]:
# Show the first 20 rows of the processed data frame (processed_data_df covers the whole data set)

processed_data_df.head(20)

In [None]:
# Save the processed data frame in CSV format - change the file name not to override the unprocessed data - for the whole dataframe

def save_processeddf(processed_datadf, output_path="/content/drive/MyDrive/processed_Task_data (1) (3).csv"):
  """Write the processed dataframe to a UTF-8 encoded CSV file without the index column.

  Parameters:
    processed_datadf: the dataframe to save.
    output_path (str): destination file. Defaults to the original hard-coded
      location, so existing single-argument calls behave as before.
  """
  # index=False is the documented boolean form of the previous index=None.
  processed_datadf.to_csv(output_path, encoding='utf-8', index=False)

In [None]:
# Call the save_processeddf function to write the whole processed dataframe to CSV

save_processeddf(processed_data_df)

In [None]:
# Test script: the sample texts also used for solutions 2 and 3 are run here to check the quality of the processed data returned.
# This processed data is expected to be used to populate the requested dataframe columns.

In [None]:
# Sample report narrative 1; passed to extract_keywords_dict in a later cell.
text_1 = '(b)(4). this reported event and subsequent repairs were investigated through the service repair process. failure data and parts-used information were reviewed for the sap and track wise files and found relevant to the service repair. a review of the device service history record was performed from the date of manufacture to the date corresponding to this service notification number. the database showed no quality notifications were opened for the device. a review of the device history record in sap for sn (b)(4) was performed from the date of the manufacture to date of the release of product, which confirmed that this device was not involved in a production failure, and product was returned for servicing which correlates to the customer reported issue. a trackwise complaint history review was completed, and it was confirmed that there were additional complaints received with similar sn (b)(4) for the same or related failure mode. the customer stated that there was no patient involvement.'

In [None]:
# Sample report narrative 2; passed to extract_keywords_dict in a later cell.
text_2 = "the patient contacted animas alleging that the subject pump auto suspends when he wakes up in the morning. there was no mention of physical damage to the pump. there was no reported patient impact associated with this complaint. (b)(4): all buttons responded to presses normally. the keypad was removed and no damage was found to the button contacts. unrelated to the complaint, evaluation revealed the internal clock battery on the pcb board had failed. the pump would not retain the user programmed date and time settings upon removal of the primary aa battery. when a new aa battery is inserted the pump displays the default date and time which must be manually confirmed (or reset) by the user in order to proceed. the pump has not been returned to animas for evaluation. animas has conducted a review of the device history record for this pump and confirmed that it was operating within required specifications at the time of release. if the device is returned, an evaluation shall be completed and a supplemental report will be filed. no conclusions can be made at this time. (b)(4)."

In [None]:
# Sample report narrative 3; passed to extract_keywords_dict in a later cell.
text_3 = "it was reported that the unit was knocked over and subsequently declared multiple errors indicative of a serial peripheral interface failure. there was no patient involvement. the manufacturer's remote service technician performed troubleshooting with the customer. the customer reported that a wire had broken from the battery and the wire was soldered back together. the technician advised the customer against re-soldering components and to replace the battery if damaged. the technician also recommended that the customer replace the data acquisition (da) ribbon cable, followed by the flow sensor and central processing unit (cpu) board. date of event: (b)(6) 2020. date of report: 20apr2020."

In [None]:
# Sample report narrative 4; passed to extract_keywords_dict in a later cell.
text_4 = "the service report shows the customer reported that the 840 ventilator stopped cycling while in use on a pt. the pt was not harmed or injured as a result of the event. the nellcor puritan bennett customer support engineer (cse) inspected the device and could not duplicate the alleged event. the unit passed extended self-testing and no parts were replaced. it is not verified that the vent was inoperable, and that a malfunction occurred."

In [None]:
# Sample report narrative 5; passed to extract_keywords_dict in a later cell.
text_5 = "the customer reported that the vela ventilator alarmed low pip (peak inspiratory pressure), low ve (minute ventilation) , xdcr (transducer) fault. the customer confirmed that there was no patient involvement associated with the reported event. vyaire medical file identification: (b)(4). the customer reported that they will not return the defective pcb for evaluation. therefore, no root cause could be determined . vyaire medical will submit a supplemental report in accordance with 21 cfr section 803.56 if additional information was received."

In [None]:
# Run the extractor on text_1; the dictionary that follows is the output recorded from an earlier run.
extract_keywords_dict(text_1)

{'keywords (Device Problems)': ['service history record performed date manufacture date corresponding this service notification',
  '',
  'history record sap 4 performed date manufacture date release product',
  'confirmed this'],
 'keywords (Device Problems)_score': ['4'],
 'keywords (Patient Problems)': [],
 'keywords (Patient Problems)_score': [],
 'keywords others (interesting ones)': [],
 'keywords others (interesting ones)_score': []}

In [None]:
# Run the extractor on text_2; the dictionary that follows is the output recorded from an earlier run.
extract_keywords_dict(text_2)

{'keywords (Device Problems)': ['history record this pump confirmed operating required specification time release',
  'returned',
  'evaluation completed supplemental report filed'],
 'keywords (Device Problems)_score': [],
 'keywords (Patient Problems)': [],
 'keywords (Patient Problems)_score': [],
 'keywords others (interesting ones)': [],
 'keywords others (interesting ones)_score': []}

In [None]:
# Run the extractor on text_3; the dictionary that follows is the output recorded from an earlier run.
extract_keywords_dict(text_3)

{'keywords (Device Problems)': [],
 'keywords (Device Problems)_score': [],
 'keywords (Patient Problems)': ['involvement'],
 'keywords (Patient Problems)_score': [],
 'keywords others (interesting ones)': [],
 'keywords others (interesting ones)_score': []}

In [None]:
# Run the extractor on text_4; the dictionary that follows is the output recorded from an earlier run.
extract_keywords_dict(text_4)

{'keywords (Device Problems)': ['duplicate alleged event'],
 'keywords (Device Problems)_score': [],
 'keywords (Patient Problems)': [],
 'keywords (Patient Problems)_score': [],
 'keywords others (interesting ones)': [],
 'keywords others (interesting ones)_score': []}

In [None]:
# Run the extractor on text_5; the dictionary that follows is the output recorded from an earlier run.
extract_keywords_dict(text_5)

{'keywords (Device Problems)': [],
 'keywords (Device Problems)_score': [],
 'keywords (Patient Problems)': ['involvement associated event'],
 'keywords (Patient Problems)_score': [],
 'keywords others (interesting ones)': [],
 'keywords others (interesting ones)_score': []}