# Method for a single sentence

In [2]:
import numpy as np
from gensim.models import KeyedVectors
from scipy.spatial.distance import jensenshannon
from tqdm import tqdm
import re

def word_perturbation_analysis(sentence, model, num_neighbors=10, num_bootstraps=500):
    def get_embedding(text):
        return model[text] if text in model else np.zeros(model.vector_size)
    
    def calculate_jsd_and_pvalue(arr1, arr2, num_bootstraps=1000, bins=30):
        def calculate_jsd(x, y):
            hist1, _ = np.histogram(x, bins=bins, density=True)
            hist2, _ = np.histogram(y, bins=bins, density=True)
            return jensenshannon(hist1, hist2)
        
        observed_jsd = calculate_jsd(arr1, arr2)
        
        combined = np.concatenate([arr1, arr2])
        n1, n2 = len(arr1), len(arr2)
        
        bootstrap_jsds = []
        for _ in range(num_bootstraps):
            resampled = np.random.choice(combined, size=n1+n2, replace=True)
            bootstrap_arr1, bootstrap_arr2 = resampled[:n1], resampled[n1:]
            bootstrap_jsd = calculate_jsd(bootstrap_arr1, bootstrap_arr2)
            bootstrap_jsds.append(bootstrap_jsd)
        
        p_value = np.mean(np.array(bootstrap_jsds) >= observed_jsd)
        
        return observed_jsd, p_value
    
    def generate_numerical_neighbors(value, num_neighbors=20):
        try:
            num_value = float(value)
            lower_bound = num_value * 0.5
            upper_bound = num_value * 1.5
            lower_neighbors = np.linspace(lower_bound, num_value, num=10, endpoint=False)
            upper_neighbors = np.linspace(num_value, upper_bound, num=11)[1:]  # Exclude the original value
            return list(np.round(np.concatenate([lower_neighbors, upper_neighbors]), 4))
        except ValueError:
            return None  # Return None if the value can't be converted to float
    
    # Tokenize the sentence
    words = re.findall(r'\b\w+\b|[^\w\s]', sentence)
    
    # Compute baseline embeddings
    baseline_embeddings = np.array([get_embedding(w) for w in words])
    
    results = {}
    
    for i, word in enumerate(tqdm(words, desc="Processing words")):
        numerical_neighbors = generate_numerical_neighbors(word)
        
        if numerical_neighbors:
            neighbors = [(str(n), 0) for n in numerical_neighbors]  # Format to match model.most_similar() output
        elif word.lower() in model.key_to_index:
            neighbors = model.most_similar(word.lower(), topn=num_neighbors)
        else:
            print(f"Warning: '{word}' is neither a number nor in vocabulary. Skipping.")
            continue
        
        distances = []
        p_values = []
        
        for neighbor, _ in tqdm(neighbors, desc=f"Analyzing neighbors of '{word}'", leave=False):
            # Replace the word with its neighbor in the sentence
            new_words = words.copy()
            new_words[i] = neighbor
            
            # Get embeddings for modified sentence
            modified_embeddings = np.array([get_embedding(w) for w in new_words])
            
            # Calculate JSD and p-value
            jsd, p_value = calculate_jsd_and_pvalue(baseline_embeddings.flatten(), modified_embeddings.flatten(), num_bootstraps)
            
            distances.append(jsd)
            p_values.append(p_value)
        
        results[word] = (np.mean(distances), np.mean(p_values))
    
    return results

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Apply the function to the given text
text_legal = """
Company A agrees to pay Company B $10 million for developing a revolutionary AI software within 12 months. If Company B fails to deliver a fully functional product by the deadline, they must refund 50% of the payment and provide an additional 3 months of development at no extra cost. However, if the delay is due to circumstances beyond Company B's reasonable control, these penalties shall not apply. This agreement is governed by California law and any disputes shall be resolved through binding arbitration.
"""

# Uncomment the following line when you have a model loaded
results = word_perturbation_analysis(text_legal, model)

# print(results)

Processing words:   1%|          | 1/95 [00:10<16:43, 10.68s/it]



Processing words:   3%|▎         | 3/95 [00:16<07:22,  4.81s/it]



Processing words:  13%|█▎        | 12/95 [01:04<08:03,  5.82s/it]



Processing words:  20%|██        | 19/95 [01:42<07:53,  6.23s/it]



Processing words:  25%|██▌       | 24/95 [02:04<06:05,  5.15s/it]



Processing words:  27%|██▋       | 26/95 [02:10<04:43,  4.11s/it]



Processing words:  35%|███▍      | 33/95 [02:43<05:11,  5.03s/it]



Processing words:  41%|████      | 39/95 [03:15<05:35,  6.00s/it]



Processing words:  44%|████▍     | 42/95 [03:26<04:13,  4.79s/it]



Processing words:  51%|█████     | 48/95 [03:58<04:38,  5.93s/it]



Processing words:  57%|█████▋    | 54/95 [04:26<03:33,  5.20s/it]



Processing words:  59%|█████▉    | 56/95 [04:31<02:40,  4.12s/it]



Processing words:  65%|██████▌   | 62/95 [04:59<02:40,  4.85s/it]



Processing words:  71%|███████   | 67/95 [05:20<02:15,  4.83s/it]



Processing words:  75%|███████▍  | 71/95 [05:37<01:50,  4.60s/it]



Processing words:  81%|████████  | 77/95 [06:04<01:29,  4.97s/it]



Processing words:  89%|████████▉ | 85/95 [06:43<00:52,  5.27s/it]



Processing words: 100%|██████████| 95/95 [07:27<00:00,  4.71s/it]






In [3]:
results

{'Company': (0.0023326451924539963, 1.0),
 'agrees': (0.0023018988439325937, 1.0),
 'pay': (0.027730039654968112, 0.937),
 'B': (0.002297112607344631, 1.0),
 '$': (0.002946541090333569, 1.0),
 '10': (0.0, 1.0),
 'million': (0.0019037648851699795, 1.0),
 'for': (0.00229067763214912, 1.0),
 'developing': (0.0021939689258674267, 1.0),
 'revolutionary': (0.002486883853655971, 1.0),
 'AI': (0.002545750593027174, 1.0),
 'software': (0.002280553645519246, 1.0),
 'within': (0.0018739667886285417, 1.0),
 '12': (0.0, 1.0),
 'months': (0.0014203138060697698, 1.0),
 'If': (0.0023753571959398596, 1.0),
 'fails': (0.0021909781915360692, 1.0),
 'deliver': (0.0022303960956712404, 1.0),
 'fully': (0.0019511600597358308, 1.0),
 'functional': (0.00261505385548873, 1.0),
 'product': (0.028300898425595616, 0.93),
 'by': (0.026966281707135477, 0.9376),
 'the': (0.0017831414527882817, 1.0),
 'deadline': (0.08552254198321639, 0.7696),
 'they': (0.0013949756399427454, 1.0),
 'must': (0.0016324051317312964, 1.0

In [4]:
print(results)

{'Company': (0.0023326451924539963, 1.0), 'agrees': (0.0023018988439325937, 1.0), 'pay': (0.027730039654968112, 0.937), 'B': (0.002297112607344631, 1.0), '$': (0.002946541090333569, 1.0), '10': (0.0, 1.0), 'million': (0.0019037648851699795, 1.0), 'for': (0.00229067763214912, 1.0), 'developing': (0.0021939689258674267, 1.0), 'revolutionary': (0.002486883853655971, 1.0), 'AI': (0.002545750593027174, 1.0), 'software': (0.002280553645519246, 1.0), 'within': (0.0018739667886285417, 1.0), '12': (0.0, 1.0), 'months': (0.0014203138060697698, 1.0), 'If': (0.0023753571959398596, 1.0), 'fails': (0.0021909781915360692, 1.0), 'deliver': (0.0022303960956712404, 1.0), 'fully': (0.0019511600597358308, 1.0), 'functional': (0.00261505385548873, 1.0), 'product': (0.028300898425595616, 0.93), 'by': (0.026966281707135477, 0.9376), 'the': (0.0017831414527882817, 1.0), 'deadline': (0.08552254198321639, 0.7696), 'they': (0.0013949756399427454, 1.0), 'must': (0.0016324051317312964, 1.0), 'refund': (0.066323907

In [5]:
text_legal = """

Company A agrees to pay Company B $10 million for developing a revolutionary AI software within 12 months. If Company B fails to deliver a fully functional product by the deadline, they must refund 50% of the payment and provide an additional 3 months of development at no extra cost. However, if the delay is due to circumstances beyond Company B's reasonable control, these penalties shall not apply. This agreement is governed by California law and any disputes shall be resolved through binding arbitration.
"""

In [6]:
# Proceed to generate LaTeX code using the results
def generate_latex_highlighted_text(sentence, results):
    # Function to escape special LaTeX characters
    def escape_latex(s):
        special_chars = {
            '\\': r'\textbackslash{}',
            '{': r'\{',
            '}': r'\}',
            '$': r'\$',
            '&': r'\&',
            '#': r'\#',
            '_': r'\_',
            '%': r'\%',
            '^': r'\^{}',
            '~': r'\~{}',
        }
        return ''.join(special_chars.get(c, c) for c in s)
    
    # Extract scores and find min and max scores
    scores = [value[0] for value in results.values()]
    min_score = min(scores)
    max_score = max(scores)
    
    # Linear scaling function
    def linear_scale(score, min_score, max_score):
        if max_score > min_score:
            return ((score - min_score) / (max_score - min_score)) * 100
        else:
            return 0
    
    # Tokenize the sentence
    tokens = re.findall(r'\s+|\w+|[^\w\s]', sentence)
    
    # Prepare the LaTeX output
    latex_output = ''
    for token in tokens:
        if token.strip():
            # Remove leading/trailing punctuation for matching
            word_clean = token.strip()
            word_clean_lower = re.sub(r'^\W+|\W+$', '', word_clean).lower()
    
            # Get the score for the word, defaulting to min_score if not found
            score = results.get(word_clean, results.get(word_clean_lower, (min_score, )))[0]
    
            # Apply linear scaling
            percentage = int(linear_scale(score, min_score, max_score))
    
            # Escape LaTeX special characters
            escaped_token = escape_latex(token)
    
            # Generate LaTeX code
            if percentage == 0:
                latex_output += escaped_token
            else:
                latex_output += r'\colorbox{{red!{}}}{{{}}}'.format(percentage, escaped_token)
        else:
            # Preserve spaces
            latex_output += token
    
    return latex_output

# Generate LaTeX code (uncomment when 'results' is available)
latex_code = generate_latex_highlighted_text(text_legal, results)
print(latex_code)



Company A agrees to \colorbox{red!10}{pay} Company B \colorbox{red!1}{\$}10 million for developing a revolutionary \colorbox{red!1}{AI} software within 12 months. If Company B fails to deliver a fully \colorbox{red!1}{functional} \colorbox{red!11}{product} \colorbox{red!10}{by} the \colorbox{red!33}{deadline}, they must \colorbox{red!26}{refund} 50\colorbox{red!1}{\%} of the \colorbox{red!2}{payment} and \colorbox{red!10}{provide} \colorbox{red!1}{an} additional \colorbox{red!2}{3} months of development at no extra \colorbox{red!12}{cost}. However, if the \colorbox{red!15}{delay} \colorbox{red!10}{is} \colorbox{red!1}{due} to \colorbox{red!2}{circumstances} beyond Company B's reasonable \colorbox{red!1}{control}, these \colorbox{red!45}{penalties} \colorbox{red!1}{shall} not \colorbox{red!10}{apply}. This \colorbox{red!12}{agreement} \colorbox{red!10}{is} \colorbox{red!1}{governed} \colorbox{red!10}{by} \colorbox{red!25}{California} \colorbox{red!15}{law} and any \colorbox{red!100}{d

In [7]:
import re
from colorsys import hsv_to_rgb

def score_to_color(score, min_score, max_score):
    if max_score == min_score:
        normalized_score = 1
    else:
        normalized_score = (score - min_score) / (max_score - min_score)
    hue = 0  # Red hue
    saturation = 1
    value = 1 - normalized_score
    r, g, b = hsv_to_rgb(hue, saturation, value)
    return f"{r:.3f},{g:.3f},{b:.3f}"

def generate_latex_highlighted_text(text, results):
    # Split text into words and punctuation
    tokens = re.findall(r'\w+|[^\w\s]|\s+', text)
    latex_tokens = []
    
    scores_dict = {k.lower(): v[0] for k, v in results.items()}
    min_score = min(scores_dict.values())
    max_score = max(scores_dict.values())
    
    for token in tokens:
        if token.strip():
            score = scores_dict.get(token.lower(), min_score)
            color = score_to_color(score, min_score, max_score)
            latex_tokens.append(f"\\textcolor[rgb]{{{color}}}{{{token}}}")
        else:
            latex_tokens.append(token)
    
    return ''.join(latex_tokens)


highlighted_text = generate_latex_highlighted_text(text_legal, results)
print(highlighted_text)



\textcolor[rgb]{0.991,0.000,0.000}{Company} \textcolor[rgb]{1.000,0.000,0.000}{A} \textcolor[rgb]{0.991,0.000,0.000}{agrees} \textcolor[rgb]{1.000,0.000,0.000}{to} \textcolor[rgb]{0.891,0.000,0.000}{pay} \textcolor[rgb]{0.991,0.000,0.000}{Company} \textcolor[rgb]{0.991,0.000,0.000}{B} \textcolor[rgb]{0.988,0.000,0.000}{$}\textcolor[rgb]{1.000,0.000,0.000}{10} \textcolor[rgb]{0.993,0.000,0.000}{million} \textcolor[rgb]{0.991,0.000,0.000}{for} \textcolor[rgb]{0.991,0.000,0.000}{developing} \textcolor[rgb]{1.000,0.000,0.000}{a} \textcolor[rgb]{0.990,0.000,0.000}{revolutionary} \textcolor[rgb]{0.990,0.000,0.000}{AI} \textcolor[rgb]{0.991,0.000,0.000}{software} \textcolor[rgb]{0.993,0.000,0.000}{within} \textcolor[rgb]{1.000,0.000,0.000}{12} \textcolor[rgb]{0.994,0.000,0.000}{months}\textcolor[rgb]{1.000,0.000,0.000}{.} \textcolor[rgb]{0.995,0.000,0.000}{If} \textcolor[rgb]{0.991,0.000,0.000}{Company} \textcolor[rgb]{0.991,0.000,0.000}{B} \textcolor[rgb]{0.991,0.000,0.000}{fails} \textcol

In [8]:
import re


# Tokenize the text
words = re.findall(r'\b\w+\b|[^\w\s]', text_legal)

# Get min and max scores
scores = [value[0] for value in results.values()]
min_score = min(scores)
max_score = max(scores)

# Function to escape LaTeX special characters
def escape_latex(s):
    special_chars = {
        '\\': '\\textbackslash{}',
        '&': '\\&',
        '%': '\\%',
        '$': '\\$',
        '#': '\\#',
        '_': '\\_',
        '{': '\\{',
        '}': '\\}',
        '~': '\\textasciitilde{}',
        '^': '\\textasciicircum{}',
        "'": "\\'",
        '"': '\\"',
        '`': '\\`',
    }
    for char, escape in special_chars.items():
        s = s.replace(char, escape)
    return s

# Generate LaTeX code
latex_output = ""
for word in words:
    word_clean = word.strip()
    if word_clean == '':
        continue  # Skip empty strings
    if word_clean in results:
        score = results[word_clean][0]
        # Normalize the score to 1-100%
        if max_score - min_score == 0:
            percentage = 100  # Avoid division by zero
        else:
            percentage = (score - min_score) / (max_score - min_score) * 99 + 1  # Ensure minimum 1%
        percentage_int = int(round(percentage))
        escaped_word = escape_latex(word)
        latex_output += f"\\colorbox{{red!{percentage_int}}}{{{escaped_word}}} "
    else:
        # Leave words without values unmodified
        escaped_word = escape_latex(word)
        latex_output += f"{escaped_word} "

# Print LaTeX output
print(latex_output)


\colorbox{red!2}{Company} A \colorbox{red!2}{agrees} to \colorbox{red!12}{pay} \colorbox{red!2}{Company} \colorbox{red!2}{B} \colorbox{red!2}{\$} \colorbox{red!1}{10} \colorbox{red!2}{million} \colorbox{red!2}{for} \colorbox{red!2}{developing} a \colorbox{red!2}{revolutionary} \colorbox{red!2}{AI} \colorbox{red!2}{software} \colorbox{red!2}{within} \colorbox{red!1}{12} \colorbox{red!2}{months} . \colorbox{red!2}{If} \colorbox{red!2}{Company} \colorbox{red!2}{B} \colorbox{red!2}{fails} to \colorbox{red!2}{deliver} a \colorbox{red!2}{fully} \colorbox{red!2}{functional} \colorbox{red!12}{product} \colorbox{red!12}{by} \colorbox{red!2}{the} \colorbox{red!34}{deadline} , \colorbox{red!2}{they} \colorbox{red!2}{must} \colorbox{red!27}{refund} \colorbox{red!1}{50} \colorbox{red!2}{\%} of \colorbox{red!2}{the} \colorbox{red!3}{payment} and \colorbox{red!11}{provide} \colorbox{red!2}{an} \colorbox{red!2}{additional} \colorbox{red!4}{3} \colorbox{red!2}{months} of \colorbox{red!2}{development} \