In [None]:
%pip install nltk
%pip install sentence_transformers
%pip install pubchempy
%pip install chemdataextractor
import nltk
%nltk.download('wordnet')
%nltk.download('omw-1.4')
%nltk.download('stopwords')
%nltk.download('punkt')
%cde data download

In [None]:
import datetime
import pandas as pd
import numpy as np
import re
import os
import csv
import random
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
from random import randint
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer, util
from chemdataextractor import Document
import pubchempy as pubc


In [41]:
def write_to_csv(values : list, filepath : str) -> None:
    '''Append one row to a CSV file.

    Parameters
    ----------
    values : list
      Non-empty list holding the cells of the row to append.
    filepath : str
      Path of the CSV file; it is opened in append mode.

    Return
    ------
    None

    Raises
    ------
    ValueError
      If ``values`` is not a list or is an empty list.
    IOError
      If the file cannot be opened or written.

    Example
    -------
    >>> write_to_csv(['value1', 'value2', 'value3'], 'output.csv')
    This will append the row ['value1', 'value2', 'value3'] to 'output.csv'.
    '''
    row_is_usable = isinstance(values, list) and len(values) > 0
    if not row_is_usable:
        raise ValueError("The 'values' parameter must be a non-empty list.")

    try:
        # newline='' lets the csv module control line endings itself.
        with open(filepath, 'a', newline='') as handle:
            csv.writer(handle).writerow(values)
    except IOError as err:
        raise IOError(f"An error occurred while writing to the file {filepath}: {err}")

In [42]:
def preprocessing(text : str) -> str:
    '''
    Tokenize the input and keep only the tokens that survive the filters.

    A token is dropped when it is found in ``string.punctuation``, consists
    only of digits, is shorter than three characters, or appears in NLTK's
    English stop-word list. The surviving tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        The input string that will be processed.

    Returns
    -------
    str
        The space-joined tokens that passed every filter.
    '''
    candidate_tokens = word_tokenize(text)
    english_stop_words = stopwords.words ('english')
    kept_tokens = []
    for token in candidate_tokens:
        if token in punctuation or token.isdigit() or len(token) < 3:
            continue
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [43]:
def cosine_similarity_score(s1,s2, vec):
    '''
    Calculate the cosine similarity score between two input strings.

    Both strings are cleaned with ``preprocessing``, vectorised together with
    ``vec.fit_transform`` and compared with scikit-learn's ``cosine_similarity``.

    Parameters
    ----------
    s1 : str
        First input string.
    s2 : str
        Second input string.
    vec : object
        Vectorizer exposing ``fit_transform`` whose result has ``toarray()``
        (callers in this notebook pass a ``TfidfVectorizer``). It is re-fitted
        on the two strings at every call.

    Returns
    -------
    float
        Cosine similarity between the two vectors, rounded to three decimals.
    '''
    sentences = [preprocessing(s1), preprocessing(s2)]
    sentence_vectors = vec.fit_transform(sentences).toarray()
    # cosine_similarity returns the pairwise 2x2 matrix; [0][1] is s1 versus s2.
    sim_matrix = cosine_similarity(sentence_vectors)
    # Cast to a plain Python float, matching transformer_similarity_score.
    return round(float(sim_matrix[0][1]),3)

In [44]:
def transformer_similarity_score(s1,s2, model):
    '''
    Score how similar two words/strings are using embeddings from a transformer model.

    Args:
        s1 (str): The first input word/string.
        s2 (str): The second input word/string.
        model (obj): Model with an ``encode`` method accepting
            ``convert_to_tensor=True``; it is called once per input.

    Returns:
        float: The ``util.cos_sim`` value between the two embeddings,
            rounded to three decimal places.
    '''
    first_embedding, second_embedding = (
        model.encode(item, convert_to_tensor=True) for item in (s1, s2)
    )
    similarity_matrix = util.cos_sim(first_embedding, second_embedding)
    # cos_sim yields a matrix; with one embedding per side the score is at [0][0].
    score = float(similarity_matrix[0][0])
    return round(score, 3)

In [96]:
def swap_word(input_file_path, word_count, similarity_tsd, max_attempts=1000, max_retries=10):
    '''
    Swap words in every text of a CSV file for their first WordNet synonym.

    For each row of the ``text`` column, randomly chosen tokens are replaced by
    the first lemma of their first WordNet synset until ``word_count`` swaps
    were made. A token is eligible when it is longer than three characters, its
    lemma is not punctuation, a stop word or a digit string, it has more than
    one synset, the synonym differs from both the token and its lemma, and the
    transformer similarity between token and synonym exceeds ``similarity_tsd``.
    The modified text replaces the row only if its TF-IDF cosine similarity to
    the original text also exceeds ``similarity_tsd``.

    Every individual swap is appended to a timestamped log file under
    ``../log/swap_word`` and the resulting frame is written to
    ``../modified_input/swap_word``.

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    word_count : int
        Number of words to swap per text.
    similarity_tsd : float
        Threshold applied to both the word-level and the text-level similarity.
    max_attempts : int, optional
        Upper bound on random position draws per try, so texts with too few
        eligible words cannot loop forever.
    max_retries : int, optional
        How many times a text is re-randomised when the modified text fails
        the text-level threshold. When exhausted the row is left unchanged.

    Returns
    -------
    None
    '''
    pmr = 'swap_word'
    mr = f'{pmr}_{word_count}_{similarity_tsd}'
    mod_input_dir = f'../modified_input/{pmr}'
    log_dir = f'../log/{pmr}'
    os.makedirs(mod_input_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    output_file_path = f'{mod_input_dir}/{mr}.csv'
    log_file_name = mr+'-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    log_file_path = f'{log_dir}/{log_file_name}.csv'
    input_test_df = pd.read_csv(input_file_path)
    input_text_series = input_test_df['text']
    stop_words = set(stopwords.words ('english'))
    write_to_csv(['actual_text', 'modified_text', 'swapped_word_pos', 'word_swapped', 'swapped_word'],log_file_path)
    model = SentenceTransformer('TimKond/S-PubMedBert-MedQuAD')
    vectorizer = TfidfVectorizer()
    lemmatizer = WordNetLemmatizer()
    # Iterate over every row; the previous `len(...) - 1` bound skipped the last one.
    for sentence_index in range(len(input_text_series)):
        text = input_text_series[sentence_index]
        for _ in range(max_retries):
            input_tokens = word_tokenize(text)
            count = 0
            attempts = 0
            while count < word_count and attempts < max_attempts and input_tokens:
                attempts += 1
                # Draw from the whole token list; starting the range at
                # word_count raised ValueError for short texts.
                swap_pos = randint(0, len(input_tokens) - 1)
                word_to_swap = input_tokens[swap_pos]
                if len(word_to_swap) <= 3:
                    continue
                word_to_swap_lemma = lemmatizer.lemmatize(word_to_swap)
                if word_to_swap_lemma in punctuation:
                    continue
                if word_to_swap_lemma in stop_words or word_to_swap_lemma.isdigit():
                    continue
                synonym_net = wordnet.synsets(word_to_swap_lemma)
                if len(synonym_net) <= 1:
                    continue
                synonym = synonym_net[0].lemmas()[0].name()
                # Cheap identity checks first, the model call last.
                if synonym == word_to_swap or synonym == word_to_swap_lemma:
                    continue
                if not transformer_similarity_score(word_to_swap, synonym, model) > similarity_tsd:
                    continue
                input_tokens[swap_pos] = synonym
                mod_text = " ".join (input_tokens)
                write_to_csv([text, mod_text, swap_pos, word_to_swap, synonym],log_file_path)
                count = count + 1
            mod_text = " ".join (input_tokens)
            if cosine_similarity_score(text, mod_text, vectorizer)>similarity_tsd:
                input_test_df.at[sentence_index, 'text'] = mod_text
                break
    input_test_df.to_csv(output_file_path)

In [85]:
def add_word(input_file_path, word_count, word_length, operation_type, max_attempts=1000):
    '''
    Insert extra words into every text of a CSV file.

    For each row of the ``text`` column a random token is picked; if it is not
    punctuation, not a stop word, not a digit string and longer than
    ``word_length`` characters, a word is inserted right after it:

    * ``operation_type == 'existing'`` inserts a copy of the token itself;
    * ``operation_type == 'new'`` inserts the first lemma of the token's first
      WordNet synset, provided the token has more than one synset and the
      transformer similarity between token and synonym exceeds 0.90.

    One log row per text is appended under ``../log/add_word`` and the modified
    frame is written to ``../modified_input/add_word``.

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    word_count : int
        Number of words to insert per text.
    word_length : int
        Only tokens strictly longer than this are used as anchors.
    operation_type : str
        ``'existing'`` or ``'new'``.
    max_attempts : int, optional
        Upper bound on random position draws per text, so a text with no
        eligible token cannot loop forever.

    Returns
    -------
    None or str
        ``'Incorrect operation type'`` for an unknown ``operation_type``
        (nothing is written in that case), otherwise ``None``.
    '''
    if operation_type not in ('existing', 'new'):
        return 'Incorrect operation type'
    pmr = 'add_word'
    mr = f'{pmr}_{word_count}_{word_length}_{operation_type}'
    mod_input_dir = f'../modified_input/{pmr}'
    log_dir = f'../log/{pmr}'
    os.makedirs(mod_input_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    output_file_path = f'{mod_input_dir}/{mr}.csv'
    log_file_name = mr+'-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    log_file_path = f'{log_dir}/{log_file_name}.csv'
    input_test_df = pd.read_csv(input_file_path)
    input_text_series = input_test_df['text']
    stop_words = set(stopwords.words ('english'))
    # The transformer is only consulted for synonyms, so skip loading it otherwise.
    model = SentenceTransformer('TimKond/S-PubMedBert-MedQuAD') if operation_type == 'new' else None
    write_to_csv(['actual_text', 'modified_text', 'word_pos', 'added_word'],log_file_path)
    for input_text_index in range(len(input_text_series)):
        count = 0
        attempts = 0
        words_added = []
        add_positions = []
        text = input_text_series[input_text_index]
        input_tokens = word_tokenize(text)
        while count < word_count and attempts < max_attempts and input_tokens:
            attempts += 1
            # Use the current list length (it grows with each insert) and start
            # at 0; the old `randint(count, stale_length - 1)` raised ValueError
            # once count exceeded the token count.
            add_pos = randint(0, len(input_tokens) - 1)
            anchor_word = input_tokens[add_pos]
            if anchor_word in punctuation:
                continue
            if anchor_word in stop_words or anchor_word.isdigit():
                continue
            if len(anchor_word) <= word_length:
                continue
            if operation_type == 'existing':
                word_to_insert = anchor_word
            else:
                synonym_net = wordnet.synsets(anchor_word)
                if len(synonym_net) <= 1:
                    continue
                word_to_insert = synonym_net[0].lemmas()[0].name()
                # Compare the anchor with its synonym; the previous code compared
                # the anchor with itself, which always passed.
                if not transformer_similarity_score(anchor_word, word_to_insert, model) > 0.90:
                    continue
            input_tokens.insert(add_pos+1, word_to_insert)
            words_added.append(word_to_insert)
            add_positions.append(add_pos)
            count = count + 1
        mod_text = " ".join (input_tokens)
        input_test_df.at[input_text_index, 'text'] = mod_text
        write_to_csv([text, mod_text, add_positions, words_added],log_file_path)
    input_test_df.to_csv(output_file_path)

In [142]:
def mistake_word(input_file_path, word_count, word_length, character_size, operation_type, max_attempts=1000):
    '''
    Introduce spelling mistakes into every text of a CSV file.

    For each row of the ``text`` column, ``word_count`` random tokens that are
    not punctuation, stop words or digit strings and are longer than
    ``word_length`` characters are misspelled by applying ``character_size``
    character edits at random positions:

    * ``'remove'`` deletes the character at the chosen position;
    * ``'change'`` replaces it with a different random lowercase letter;
    * ``'add'``    duplicates it.

    One log row per text is appended under ``../log/mistake_word`` and the
    modified frame is written to ``../modified_input/mistake_word``. (The
    function previously wrote into the ``add_word`` directories.)

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    word_count : int
        Number of words to misspell per text.
    word_length : int
        Only tokens strictly longer than this are misspelled.
    character_size : int
        Number of character edits applied to each chosen word.
    operation_type : str
        ``'remove'``, ``'change'`` or ``'add'``.
    max_attempts : int, optional
        Upper bound on random position draws per text, so a text with no
        eligible token cannot loop forever.

    Returns
    -------
    None or str
        ``'Incorrect operation type'`` for an unknown ``operation_type``
        (nothing is written in that case), otherwise ``None``.
    '''
    if operation_type not in ('remove', 'change', 'add'):
        return 'Incorrect operation type'
    pmr = 'mistake_word'
    mr = f'{pmr}_{word_count}_{word_length}_{character_size}_{operation_type}'
    mod_input_dir = f'../modified_input/{pmr}'
    log_dir = f'../log/{pmr}'
    os.makedirs(mod_input_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    output_file_path = f'{mod_input_dir}/{mr}.csv'
    log_file_name = mr+'-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    log_file_path = f'{log_dir}/{log_file_name}.csv'
    input_test_df = pd.read_csv(input_file_path)
    input_text_series = input_test_df['text']
    stop_words = set(stopwords.words ('english'))
    lowercase_letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]
    write_to_csv(['actual_text', 'modified_text', 'word_pos', 'correct_word', 'misspelled_word'],log_file_path)
    for input_text_index in range(len(input_text_series)):
        count = 0
        attempts = 0
        words_misspelled = []
        misspelled_positions = []
        correct_words = []
        text = input_text_series[input_text_index]
        input_tokens = word_tokenize(text)
        while count < word_count and attempts < max_attempts and input_tokens:
            attempts += 1
            misspelled_pos = randint(0, len(input_tokens) - 1)
            correct_word = input_tokens[misspelled_pos]
            if len(correct_word) <= word_length:
                continue
            if correct_word in punctuation:
                continue
            if correct_word in stop_words or correct_word.isdigit():
                continue
            list_of_chars = list(correct_word)
            for _ in range(character_size):
                if not list_of_chars:
                    # Nothing left to edit (every character was removed).
                    break
                char_pos = randint(0, len(list_of_chars) - 1)
                if operation_type == 'remove':
                    # Delete by position; list.remove(value) would drop the
                    # first equal character instead of the chosen one.
                    del list_of_chars[char_pos]
                elif operation_type == 'change':
                    # Substitute with a letter that differs from the current
                    # one, so the edit always changes the word.
                    alternatives = [ch for ch in lowercase_letters if ch != list_of_chars[char_pos]]
                    list_of_chars[char_pos] = random.choice(alternatives)
                else:
                    list_of_chars.insert(char_pos + 1, list_of_chars[char_pos])
            misspelled_word = "".join (list_of_chars)
            input_tokens[misspelled_pos] = misspelled_word
            # Record and count once per word, not once per character edit.
            correct_words.append(correct_word)
            words_misspelled.append(misspelled_word)
            misspelled_positions.append(misspelled_pos)
            count = count + 1
        mod_text = " ".join (input_tokens)
        input_test_df.at[input_text_index, 'text'] = mod_text
        write_to_csv([text, mod_text, misspelled_positions, correct_words, words_misspelled],log_file_path)
    input_test_df.to_csv(output_file_path)

In [7]:
def remove_word(input_file_path, word_count, word_length, max_attempts=1000, max_retries=10):
    '''
    Remove words from every text of a CSV file.

    For each row of the ``text`` column, ``word_count`` random tokens that are
    not punctuation, stop words or digit strings and are longer than
    ``word_length`` characters are deleted. The shortened text replaces the row
    only if its transformer similarity to the original text exceeds 0.90.

    One log row per accepted text is appended under ``../log/remove_word`` and
    the modified frame is written to ``../modified_input/remove_word``.

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    word_count : int
        Number of words to remove per text.
    word_length : int
        Only tokens strictly longer than this are removed.
    max_attempts : int, optional
        Upper bound on random position draws per try, so a text with too few
        eligible tokens cannot loop forever.
    max_retries : int, optional
        How many times a text is re-randomised when the shortened text fails
        the 0.90 threshold. When exhausted the row is left unchanged.

    Returns
    -------
    None
    '''
    pmr = 'remove_word'
    mr = f'{pmr}_{word_count}_{word_length}'
    mod_input_dir = f'../modified_input/{pmr}'
    log_dir = f'../log/{pmr}'
    os.makedirs(mod_input_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    output_file_path = f'{mod_input_dir}/{mr}.csv'
    log_file_name = mr+'-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    log_file_path = f'{log_dir}/{log_file_name}.csv'
    input_test_df = pd.read_csv(input_file_path)
    input_text_series = input_test_df['text']
    stop_words = set(stopwords.words ('english'))
    model = SentenceTransformer('TimKond/S-PubMedBert-MedQuAD')
    # The last column holds the removed words; it used to be labelled 'added_word'.
    write_to_csv(['actual_text', 'modified_text', 'word_pos', 'removed_word'],log_file_path)
    # Iterate over every row; the previous `len(...) - 1` bound skipped the last one.
    for sentence_index in range(len(input_text_series)):
        text = input_text_series[sentence_index]
        for _ in range(max_retries):
            count = 0
            attempts = 0
            words_removed = []
            remove_positions = []
            input_tokens = word_tokenize(text)
            while count < word_count and attempts < max_attempts and input_tokens:
                attempts += 1
                # Lower bound 0: with `count` as lower bound the range became
                # empty (ValueError) as the list shrank.
                remove_pos = randint(0, len(input_tokens) - 1)
                word_to_remove = input_tokens[remove_pos]
                if word_to_remove in punctuation:
                    continue
                if word_to_remove in stop_words or word_to_remove.isdigit():
                    continue
                if len(word_to_remove) <= word_length:
                    continue
                # Delete the sampled position; list.remove(value) would delete
                # the first equal token, contradicting the logged position.
                del input_tokens[remove_pos]
                words_removed.append(word_to_remove)
                remove_positions.append(remove_pos)
                count = count + 1
            mod_text = " ".join (input_tokens)
            if transformer_similarity_score(text, mod_text, model) > 0.90:
                input_test_df.at[sentence_index, 'text'] = mod_text
                write_to_csv([text, mod_text, remove_positions, words_removed],log_file_path)
                break
    input_test_df.to_csv(output_file_path)

In [27]:
def demographic_change(input_file_path, demographic_type, operation_type):
    '''
    Swap or neutralise gendered terms in every text of a CSV file.

    The lowercase terms ``he``, ``him``, ``himself``, ``his``, ``she``,
    ``her``, ``herself`` and ``hers`` are handled for both demographic types;
    ``demographic_type == 'gender'`` additionally handles ``sex m`` and
    ``sex f``. Matching is case-sensitive and restricted to whole words.

    * ``operation_type == 'swap'`` maps each term to its counterpart
      (``he`` <-> ``she``, ``him`` <-> ``her``, ``himself`` <-> ``herself``,
      ``his`` <-> ``hers``, ``sex m`` <-> ``sex f``).
    * ``operation_type == 'replace'`` replaces each term with ``patient``.

    All substitutions happen in a single regex pass over the original text, so
    a term that was just swapped is not swapped back by a later rule. Any other
    ``demographic_type`` / ``operation_type`` leaves the texts unchanged.

    One log row per text (original text, modified text, Yes/No flag) is
    appended under ``../log/demographic_change`` and the modified frame is
    written to ``../modified_input/demographic_change``.

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    demographic_type : str
        ``'gender'`` or ``'pronoun'``.
    operation_type : str
        ``'swap'`` or ``'replace'``.

    Returns
    -------
    None
    '''
    pmr = 'demographic_change'
    mr = f'{pmr}_{demographic_type}_{operation_type}'
    mod_input_dir = f'../modified_input/{pmr}'
    log_dir = f'../log/{pmr}'
    os.makedirs(mod_input_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    output_file_path = f'{mod_input_dir}/{mr}.csv'
    log_file_name = mr+'-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    log_file_path = f'{log_dir}/{log_file_name}.csv'
    input_test_df = pd.read_csv(input_file_path)
    input_text_series = input_test_df['text']

    pronoun_swaps = {
        'he': 'she', 'him': 'her', 'himself': 'herself', 'his': 'hers',
        'she': 'he', 'her': 'him', 'herself': 'himself', 'hers': 'his',
    }
    sex_swaps = {'sex m': 'sex f', 'sex f': 'sex m'}
    if demographic_type == 'gender':
        swap_table = {**pronoun_swaps, **sex_swaps}
    elif demographic_type == 'pronoun':
        swap_table = dict(pronoun_swaps)
    else:
        swap_table = {}
    if operation_type == 'swap':
        replacements = swap_table
    elif operation_type == 'replace':
        replacements = {term: 'patient' for term in swap_table}
    else:
        replacements = {}

    term_pattern = None
    if replacements:
        # Longest alternatives first so multi-word terms are tried before their parts.
        ordered_terms = sorted(replacements, key=len, reverse=True)
        term_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(term) for term in ordered_terms) + r')\b')

    write_to_csv(['actual_text', 'modified_text', 'is_mofified'],log_file_path)
    for input_text_index in range(len(input_text_series)):
        text = input_text_series[input_text_index]
        mod_text = text
        modified = 'No'
        if term_pattern is not None:
            mod_text, hit_count = term_pattern.subn(lambda match: replacements[match.group(0)], text)
            if hit_count:
                modified = 'Yes'
        input_test_df.at[input_text_index, 'text'] = mod_text
        # `text` is never reassigned, so the log keeps the true original.
        write_to_csv([text, mod_text, modified],log_file_path)
    input_test_df.to_csv(output_file_path)

In [28]:
def remove_demographic(input_file_path, demographic_type):
    '''
    Delete gendered terms from every text of a CSV file.

    The lowercase whole words ``he``, ``him``, ``himself``, ``his``, ``she``,
    ``her``, ``herself`` and ``hers`` are replaced by an empty string;
    ``demographic_type == 'gender'`` additionally removes ``sex m`` and
    ``sex f``. Matching is case-sensitive. Surrounding whitespace is not
    collapsed. Any other ``demographic_type`` leaves the texts unchanged.

    One log row per text (original text, modified text, Yes/No flag) is
    appended under ``../log/remove_demographic`` and the modified frame is
    written to ``../modified_input/remove_demographic``.

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    demographic_type : str
        ``'gender'`` or ``'pronoun'``.

    Returns
    -------
    None
    '''
    pmr = 'remove_demographic'
    mr = f'{pmr}_{demographic_type}'
    mod_input_dir = f'../modified_input/{pmr}'
    log_dir = f'../log/{pmr}'
    os.makedirs(mod_input_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    output_file_path = f'{mod_input_dir}/{mr}.csv'
    log_file_name = mr+'-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    log_file_path = f'{log_dir}/{log_file_name}.csv'
    input_test_df = pd.read_csv(input_file_path)
    input_text_series = input_test_df['text']

    pronoun_terms = ['himself', 'herself', 'hers', 'him', 'his', 'her', 'she', 'he']
    if demographic_type == 'gender':
        # The previous search pattern `\sex m` (whitespace + "ex m") was a typo
        # for `\bsex m\b`, and the sex substitution was overwritten right away.
        terms = ['sex m', 'sex f'] + pronoun_terms
    elif demographic_type == 'pronoun':
        terms = pronoun_terms
    else:
        terms = []
    term_pattern = None
    if terms:
        term_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(term) for term in terms) + r')\b')

    write_to_csv(['actual_text', 'modified_text', 'is_mofified'],log_file_path)
    for input_text_index in range(len(input_text_series)):
        text = input_text_series[input_text_index]
        mod_text = text
        modified = 'No'
        if term_pattern is not None:
            mod_text, hit_count = term_pattern.subn('', text)
            if hit_count:
                modified = 'Yes'
        input_test_df.at[input_text_index, 'text'] = mod_text
        # `text` is never reassigned, so the log keeps the true original.
        write_to_csv([text, mod_text, modified],log_file_path)
    input_test_df.to_csv(output_file_path)

In [40]:
def replace_acronym(input_file_path, acronym_dict):
    '''
    Replace acronyms in every text of a CSV file.

    Each key of ``acronym_dict`` is searched as a literal whole word (the key
    is regex-escaped and wrapped in ``\\b``) and replaced by its value, which
    is inserted verbatim. A text is flagged as modified, and the per-text
    counter is incremented, only for keys that actually occurred in it.

    One log row per text is appended under ``../log/replace_acronym`` and the
    modified frame is written to ``../modified_input/replace_acronym``.

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    acronym_dict : dict
        Mapping from acronym to replacement string.

    Returns
    -------
    None
    '''
    pmr = 'replace_acronym'
    mr = f'{pmr}_{len(acronym_dict.keys())}'
    mod_input_dir = f'../modified_input/{pmr}'
    log_dir = f'../log/{pmr}'
    os.makedirs(mod_input_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    output_file_path = f'{mod_input_dir}/{mr}.csv'
    log_file_name = mr+'-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    log_file_path = f'{log_dir}/{log_file_name}.csv'
    input_test_df = pd.read_csv(input_file_path)
    input_text_series = input_test_df['text']
    write_to_csv(['actual_text', 'modified_text', 'is_mofified', 'num_of_acronyms_mod'],log_file_path)
    for input_text_index in range(len(input_text_series)):
        text = input_text_series[input_text_index]
        mod_text = text
        modified = 'No'
        num_of_acronyms_mod = 0
        for acronym, expansion in acronym_dict.items():
            pattern = r'\b{}\b'.format(re.escape(acronym))
            # A function replacement keeps backslashes in the expansion literal.
            mod_text, hit_count = re.subn(pattern, lambda _match, value=expansion: value, mod_text)
            # Only count acronyms that were really present in this text.
            if hit_count:
                modified = 'Yes'
                num_of_acronyms_mod += 1
        input_test_df.at[input_text_index, 'text'] = mod_text
        write_to_csv([text, mod_text, modified, num_of_acronyms_mod],log_file_path)
    input_test_df.to_csv(output_file_path)

In [None]:
def replace_chemical(input_file_path, chemical_dict):
    '''
    Replace chemical names in every text of a CSV file.

    Each key of ``chemical_dict`` is searched as a literal whole word (the key
    is regex-escaped and wrapped in ``\\b``) and replaced by its value, which
    is inserted verbatim. A text is flagged as modified, and the per-text
    counter is incremented, only for keys that actually occurred in it.

    One log row per text is appended under ``../log/replace_chemical`` and the
    modified frame is written to ``../modified_input/replace_chemical``.

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    chemical_dict : dict
        Mapping from chemical name to replacement string.

    Returns
    -------
    None
    '''
    pmr = 'replace_chemical'
    mr = f'{pmr}_{len(chemical_dict.keys())}'
    mod_input_dir = f'../modified_input/{pmr}'
    log_dir = f'../log/{pmr}'
    os.makedirs(mod_input_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    output_file_path = f'{mod_input_dir}/{mr}.csv'
    log_file_name = mr+'-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    log_file_path = f'{log_dir}/{log_file_name}.csv'
    input_test_df = pd.read_csv(input_file_path)
    input_text_series = input_test_df['text']
    # Header labels kept as-is so existing log readers keep working.
    write_to_csv(['actual_text', 'modified_text', 'is_mofified', 'num_of_acronyms_mod'],log_file_path)
    for input_text_index in range(len(input_text_series)):
        text = input_text_series[input_text_index]
        mod_text = text
        modified = 'No'
        num_of_chemicals_mod = 0
        for chemical, replacement in chemical_dict.items():
            pattern = r'\b{}\b'.format(re.escape(chemical))
            # A function replacement keeps backslashes in the replacement literal.
            mod_text, hit_count = re.subn(pattern, lambda _match, value=replacement: value, mod_text)
            # Only count chemicals that were really present in this text.
            if hit_count:
                modified = 'Yes'
                num_of_chemicals_mod += 1
        input_test_df.at[input_text_index, 'text'] = mod_text
        write_to_csv([text, mod_text, modified, num_of_chemicals_mod],log_file_path)
    input_test_df.to_csv(output_file_path)

## Similarity Distribution

In [None]:
def similarity_score_add_distribution(input_file_path, output_file_path, num_of_words, word_len, operation_type, max_attempts=1000):
    '''
    Record how much adding words changes the similarity of each text.

    For each row of the ``text`` column, ``num_of_words`` words are inserted
    after random tokens that are not punctuation, stop words or digit strings
    and are longer than ``word_len`` characters. With
    ``operation_type == 'existing'`` the token itself is duplicated; with
    ``'new'`` the first lemma of its first WordNet synset is inserted (tokens
    with at most one synset are skipped). The transformer similarity between
    the original and the modified text is stored per row.

    One log row per text is appended to a timestamped file in ``../log`` and
    the scores are written to ``output_file_path`` as a CSV with one
    ``similarity_Score`` column indexed by row number.

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    output_file_path : str
        Destination CSV for the similarity scores.
    num_of_words : int
        Number of words to insert per text.
    word_len : int
        Only tokens strictly longer than this are used as anchors.
    operation_type : str
        ``'existing'`` or ``'new'``.
    max_attempts : int, optional
        Upper bound on random position draws per text, so a text with no
        eligible token cannot loop forever.

    Returns
    -------
    None or str
        ``'Incorrect operation type'`` for an unknown ``operation_type``
        (nothing is written in that case), otherwise ``None``.
    '''
    if operation_type not in ('existing', 'new'):
        return 'Incorrect operation type'
    input_test_df = pd.read_csv(input_file_path)
    log_file_name = 'add_change_log-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    # Make sure the log directory exists before the first append.
    os.makedirs('../log', exist_ok=True)
    log_file_path = '../log/'+log_file_name+'.csv'
    input_text_series = input_test_df['text']
    stop_words = set(stopwords.words ('english'))
    model = SentenceTransformer('TimKond/S-PubMedBert-MedQuAD')
    write_to_csv(['actual_text', 'modified_text', 'word_pos', 'added_word'],log_file_path)
    output_dict = {}
    for input_text_index in range(len(input_text_series)):
        count = 0
        attempts = 0
        words_added = []
        add_positions = []
        text = input_text_series[input_text_index]
        input_tokens = word_tokenize(text)
        while count < num_of_words and attempts < max_attempts and input_tokens:
            attempts += 1
            # Use the current list length and start at 0; the old
            # `randint(count, stale_length - 1)` raised ValueError once count
            # exceeded the token count.
            add_pos = randint(0, len(input_tokens) - 1)
            anchor_word = input_tokens[add_pos]
            if anchor_word in punctuation:
                continue
            if anchor_word in stop_words or anchor_word.isdigit():
                continue
            if len(anchor_word) <= word_len:
                continue
            if operation_type == 'existing':
                word_to_insert = anchor_word
            else:
                synonym_net = wordnet.synsets(anchor_word)
                if len(synonym_net) <= 1:
                    continue
                word_to_insert = synonym_net[0].lemmas()[0].name()
            input_tokens.insert(add_pos+1, word_to_insert)
            words_added.append(word_to_insert)
            add_positions.append(add_pos)
            count = count + 1
        mod_text = " ".join (input_tokens)
        output_dict[input_text_index] = transformer_similarity_score(text, mod_text, model)
        write_to_csv([text, mod_text, add_positions, words_added],log_file_path)
    output_df = pd.DataFrame.from_dict(output_dict, orient='index', columns=['similarity_Score'])
    output_df.to_csv(output_file_path)

In [37]:
def similarity_score_swap_distribution(input_file_path, output_file_path, num_of_words_to_swap, max_attempts=1000):
    '''
    Record word-level and text-level similarity after synonym swaps.

    For each row of the ``text`` column, ``num_of_words_to_swap`` random tokens
    are replaced by the first lemma of their first WordNet synset. A token is
    eligible when it is longer than three characters, its lemma is not
    punctuation, a stop word or a digit string, it has more than one synset and
    the synonym differs from both the token and its lemma. No similarity
    threshold is applied.

    Outputs:

    * the transformer similarity of every (token, synonym) pair, written to
      ``../sim_analysis/test_full_swap_mr_word_sim_r3.csv``;
    * the TF-IDF cosine similarity between each original and modified text,
      written to ``output_file_path``;
    * one log row per swap in a timestamped file in ``../log``.

    Parameters
    ----------
    input_file_path : str
        CSV file containing a ``text`` column.
    output_file_path : str
        Destination CSV for the text-level similarity scores.
    num_of_words_to_swap : int
        Number of words to swap per text.
    max_attempts : int, optional
        Upper bound on random position draws per text, so a text with too few
        eligible tokens cannot loop forever.

    Returns
    -------
    None
    '''
    input_test_df = pd.read_csv(input_file_path)
    log_file_name = 'swap_change_log-{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now())
    log_file_path = '../log/'+log_file_name+'.csv'
    word_output_file_path = '../sim_analysis/test_full_swap_mr_word_sim_r3.csv'
    # Both target directories must exist before anything is written to them.
    os.makedirs('../log', exist_ok=True)
    os.makedirs(os.path.dirname(word_output_file_path), exist_ok=True)
    input_text_series = input_test_df['text']
    stop_words = set(stopwords.words ('english'))
    write_to_csv(['actual_text', 'modified_text', 'swapped_word_pos', 'word_swapped', 'swapped_word'],log_file_path)
    model = SentenceTransformer('TimKond/S-PubMedBert-MedQuAD')
    lemmatizer = WordNetLemmatizer()
    word_index = 0
    output_dict = {}
    output_word_dict = {}
    for input_text_index in range(len(input_text_series)):
        word_count = 0
        attempts = 0
        text = input_text_series[input_text_index]
        input_tokens = word_tokenize(text)
        while word_count < num_of_words_to_swap and attempts < max_attempts and input_tokens:
            attempts += 1
            # Lower bound 0: using word_count as lower bound raised ValueError
            # for texts shorter than the number of requested swaps.
            swap_pos = randint(0, len(input_tokens) - 1)
            word_to_swap = input_tokens[swap_pos]
            if len(word_to_swap) <= 3:
                continue
            word_to_swap_lemma = lemmatizer.lemmatize(word_to_swap)
            if word_to_swap_lemma in punctuation:
                continue
            if word_to_swap_lemma in stop_words or word_to_swap_lemma.isdigit():
                continue
            synonym_net = wordnet.synsets(word_to_swap_lemma)
            if len(synonym_net) <= 1:
                continue
            synonym = synonym_net[0].lemmas()[0].name()
            if synonym == word_to_swap or synonym == word_to_swap_lemma:
                continue
            output_word_dict[word_index] = transformer_similarity_score(word_to_swap, synonym, model)
            word_index = word_index + 1
            input_tokens[swap_pos] = synonym
            mod_text = " ".join (input_tokens)
            write_to_csv([text, mod_text, swap_pos, word_to_swap, synonym],log_file_path)
            word_count = word_count + 1
        mod_text = " ".join (input_tokens)
        output_dict[input_text_index] = cosine_similarity_score(text, mod_text, TfidfVectorizer())
    # Write the word-level scores once, instead of rewriting the whole file
    # after every single swap.
    output_word_df = pd.DataFrame.from_dict(output_word_dict, orient='index', columns=['similarity_Score'])
    output_word_df.to_csv(word_output_file_path)
    # Only the scores go to output_file_path; the earlier dump of the unchanged
    # input frame to the same path was overwritten immediately and is dropped.
    output_df = pd.DataFrame.from_dict(output_dict, orient='index', columns=['similarity_Score'])
    output_df.to_csv(output_file_path)