In [None]:
# Import the neccesary libraries 

import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('all')

In [21]:
# Path to the CSV data file - mounted on my Google Drive

data_path = "/content/drive/MyDrive/Task_data (1) (3).csv"

In [22]:
# Function to convert the CSV file to a dataframe. It takes the path to the data file as parameter and returns a dataframe of the data

def csv_to_df(path):
  datadf = pd.read_csv(path, low_memory=False)
  return datadf

In [37]:
# This function takes a text description as a parameter and returns three lists which categorise the content of the text description into device
# patient, and any other one

def extract_keyword_corpus(text_desr):

  device_problem = []
  patient_problem = []
  other_problem = []
  
  global final_device_problem_keyphrases, final_patient_problem_keyphrases, final_other_problem_keyphrases
  
  text_list = text_desr.split('.')
  
  for text in text_list:
    if bool(re.search("device", text, re.IGNORECASE)):
      device_problem.append(text)
    
    elif bool(re.search("patient", text, re.IGNORECASE)):
      patient_problem.append(text)
      
    else:
      other_problem.append(text)
  
  if len(device_problem) > 0:
    device_problem_keyphrases = [d_problem.split('device')[1] for d_problem in device_problem]
    device_problem_keyphrases = [device_problem_keyphrases.split(',') for device_problem_keyphrases in device_problem_keyphrases]
    final_device_problem_keyphrases = [item.lstrip() for sublist in device_problem_keyphrases for item in sublist]
  
  elif len(device_problem) == 0:
    final_device_problem_keyphrases = device_problem
  
  if len(patient_problem) > 0:
    patient_problem_keyphrases = [p_problem.split('patient')[1] for p_problem in patient_problem]
    patient_problem_keyphrases = [patient_problem_keyphrases.split(',') for patient_problem_keyphrases in patient_problem_keyphrases]
    final_patient_problem_keyphrases = [item.lstrip() for sublist in patient_problem_keyphrases for item in sublist]
  
  elif len(patient_problem) == 0:
    final_patient_problem_keyphrases = patient_problem
  
  if len(other_problem) > 0:
    other_problem_keyphrases = [o_problem.split(',') for o_problem in other_problem]
    final_other_problem_keyphrases = [item.lstrip() for sublist in other_problem_keyphrases for item in sublist]
  
  elif len(other_problem) == 0:
    final_other_problem_keyphrases = other_problem
  
  return final_device_problem_keyphrases,  final_patient_problem_keyphrases, final_other_problem_keyphrases

In [32]:
text_desr = '(b)(4). this reported event and subsequent repairs were investigated through the service repair process. failure data and parts-used information were reviewed for the sap and track wise files and found relevant to the service repair. a review of the device service history record was performed from the date of manufacture to the date corresponding to this service notification number. the database showed no quality notifications were opened for the device. a review of the device history record in sap for sn (b)(4) was performed from the date of the manufacture to date of the release of product, which confirmed that this device was not involved in a production failure, and product was returned for servicing which correlates to the customer reported issue. a trackwise complaint history review was completed, and it was confirmed that there were additional complaints received with similar sn (b)(4) for the same or related failure mode. the customer stated that there was no patient involvement.'

In [25]:
# Initialize WordNetLemmatizer for the NLP processing

wordnet_lemmatizer = WordNetLemmatizer()

In [26]:
# Create stop words from fundamental English stop words
# The stopwords.txt file is a collection of English words that should be eliminated because they do not add any important meaning

stopwords = set(w.rstrip() for w in open('/content/drive/MyDrive/stopwords.txt'))

# Adding more stopwords specific to this problem - as seen in the raw data
# This would be iterately expanded to optimize the performance of the tokenizer

stopwords = stopwords.union({'wa','reported'})

In [27]:
# Tokenization and Text pre-processing function
# This function tokenizes each sentence in the problem list provided, and returns a new list of tokens and any numeric value found 

def my_tokenizer(sentence):
  
  eliminate_words = [',',"'",'.','(',')','{','}',':','e.g',';', '@', '{}', '()'] # This can also be increased to 
  sentence = sentence.lower() # downcase
  tokens = nltk.tokenize.word_tokenize(sentence) # split string into words (tokens)
  tokens = [t for t in tokens if len(t) > 2 or len(t)==1]   # remove short words, they're probably not useful
  tokens = [t for t in tokens if t not in eliminate_words] # Remove those weird characters in the eliminate_words list
  tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
  tokens = [t for t in tokens if t not in stopwords] # remove stopwords
  digits = [t for t in tokens if any(c.isdigit() for c in t)] # get the digits seperately (probably these are the scores)
  
  return " ".join(tokens), digits

In [28]:
# This function takes the text description as a parameter, passes it to the extract_keyword_corpus function.
# The returned values from the extract_keyword_corpus function are further processed to get a dictionary data structure for the required features
# This function returns a dictionary where the key-values are the six columns to be populated in the dataframe

def extract_keywords_dict(text_desr):
  
  keyword_corpus = extract_keyword_corpus(text_desr) #The extract_keyword_corpus function is called here

  keyword_corpus_device = keyword_corpus[0]
  keyword_corpus_patient = keyword_corpus[1]
  keyword_corpus_other = keyword_corpus[2]

  keywords_dict = {}
  
  device_keywords = []
  device_score = []
  patient_keywords = []
  patient_score = []
  other_keywords = []
  other_score = []
  
  if len(keyword_corpus_device) > 0:
    for sent_d in keyword_corpus_device:
      sentence_tokens = my_tokenizer(sent_d) # The tokenizer function is called here
      device_keywords.append(sentence_tokens[0])
      if len(sentence_tokens[1]) > 1:
        joined_digits = " ".join(sentence_tokens[1])
        device_score.append(joined_digits)
      elif len(sentence_tokens[1]) == 1:
        digits = sentence_tokens[1][0]
        device_score.append(digits)
  
  elif len(keyword_corpus_patient) > 0:
    for sent_p in keyword_corpus_patient:
      sentence_tokens = my_tokenizer(sent_p) # The tokenizer function is called here
      patient_keywords.append(sentence_tokens[0])
      if len(sentence_tokens[1]) > 1:
        joined_digits = " ".join(sentence_tokens[1])
        patient_score.append(joined_digits)
      elif len(sentence_tokens[1]) == 1:
        digits = sentence_tokens[1][0]
        patient_score.append(digits)
  
  elif len(keyword_corpus_other) > 0:
    for sent_o in keyword_corpus_other:
      sentence_tokens = my_tokenizer(sent_o) # The tokenizer function is called here
      other_keywords.append(sentence_tokens[0])
      if len(sentence_tokens[1]) > 1:
        joined_digits = " ".join(sentence_tokens[1])
        other_score.append(joined_digits)
      elif len(sentence_tokens[1]) == 1:
        digits = sentence_tokens[1][0]
        other_score.append(digits)
  
  keywords_dict["keywords (Device Problems)"] = device_keywords
  keywords_dict["keywords (Device Problems)_score"] = device_score
  keywords_dict["keywords (Patient Problems)"] = patient_keywords
  keywords_dict["keywords (Patient Problems)_score"] = patient_score
  keywords_dict["keywords others (interesting ones)"] = other_keywords
  keywords_dict["keywords others (interesting ones)_score"] = other_score

  return keywords_dict

In [50]:
#Function for process the data frame as required

def process_df(datadf):

  # row index starts from zero
  idx = 0

  while idx <= len(datadf)-1:

    for text in datadf["TEXT DESCRIPTION"]:


      keywords_to_use = extract_keywords_dict(text) # The extract_keywords_dict function is called here
      
      try:
        datadf.iloc[[idx],[3]] = keywords_to_use.get("keywords (Device Problems)", "Not available")
        datadf.iloc[[idx], [4]] = keywords_to_use.get("keywords (Device Problems)_score", "Not available")
        datadf.iloc[[idx], [5]] = keywords_to_use.get("keywords (Patient Problems)", "Not available")
        datadf.iloc[[idx], [6]] = keywords_to_use.get("keywords (Patient Problems)_score", "Not available")
        datadf.iloc[[idx], [7]] = keywords_to_use.get("keywords others (interesting ones)", "Not available")
        datadf.iloc[[idx], [8]] = keywords_to_use.get("keywords others (interesting ones)_score", "Not available")

      except ValueError: # This ValueError is exempted because we are passing lists of different lengths, and they contain different num of items to update the dataframe
        continue

      idx += 1

  return datadf

In [52]:
# Call the csv_to_df function that opens the CSV file into a dataframe

data_df = csv_to_df(data_path)

copy_data_df = data_df.copy(deep = True)

In [53]:

part_copy_data_df = copy_data_df[:20]

In [54]:
part_copy_data_df.head(5)

Unnamed: 0,RN,RK,TEXT DESCRIPTION,keywords (Device Problems),keywords (Device Problems)_score,keywords (Patient Problems),keywords (Patient Problems)_score,keywords others (interesting ones),keywords others (interesting ones)_score,MDR PP,NLP PP
0,2016493-2021-17616,11224049,it was reported that the device failed prevent...,,,,,,,['F27' 'E2403' 'A0509'],"['A1301', 'F27', 'E2403']"
1,2016493-2020-85916,11101248,"fails calibration. replaced linear sensor, fro...",,,,,,,['F27' 'A0401' 'A0404' 'A040502' 'A25' 'A0509'...,"['F27', 'A0404', 'A070903']"
2,2031642-2020-00098,9565035,the customer reported a noise was heard at the...,,,,,,,['A0508'],"['A0508', 'E2330', 'A090201']"
3,2016493-2020-39626,10835372,(b)(4). this reported event and subsequent rep...,,,,,,,['F27' 'A0401' 'A0404' 'F24' 'E2401' 'A26' 'A0...,"['F27', 'A0404', 'A0401']"
4,2016493-2020-42584,10858883,(b)(4). this reported event and subsequent rep...,,,,,,,['F27' 'A0404' 'A2305' 'A0509'],"['F27', 'A0404', 'A0401']"


In [None]:
processed_data_df = process_df(part_copy_data_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [None]:
# Call the process_df function

processed_data_df = process_df(data_df)

In [91]:
# Show the first 20 rows of the processed data frame

processed_data_df.head(20)

In [61]:
# Save the processed data frame in CSV format - change the file name not to override the unprocessed data

def save_processeddf(processed_datadf):
  processed_datadf.to_csv("/content/drive/MyDrive/processed_Task_data (1) (3).csv", encoding='utf-8', index=None)

In [73]:
# Call the save_processeddf function

save_processeddf(processed_data_df)