In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
reference = pd.read_csv('NIMP_reference_corpus.csv')
reference = reference[reference['language'] == 'en']
reference

Unnamed: 0,body,language
1,Seeking information on digital strategies that...,en
2,I am contacting you to request information on ...,en
4,"Dear Customer Support, I am reaching out to in...",en
5,Inquiring about best practices for securing me...,en
7,"The integration stopped working unexpectedly, ...",en
...,...,...
19992,Seeking details on securing medical data using...,en
19993,Can you provide information on digital strateg...,en
19994,Request for assistance in improving digital ma...,en
19995,I am facing integration problems with IFTTT Do...,en


In [3]:
reference['body'] = reference['body'].str.replace('_', ' ')
reference['tokens'] = reference['body'].str.split()
tokens = sum(reference['tokens'].dropna().tolist(), [])

In [4]:
reference

Unnamed: 0,body,language,tokens
1,Seeking information on digital strategies that...,en,"[Seeking, information, on, digital, strategies..."
2,I am contacting you to request information on ...,en,"[I, am, contacting, you, to, request, informat..."
4,"Dear Customer Support, I am reaching out to in...",en,"[Dear, Customer, Support,, I, am, reaching, ou..."
5,Inquiring about best practices for securing me...,en,"[Inquiring, about, best, practices, for, secur..."
7,"The integration stopped working unexpectedly, ...",en,"[The, integration, stopped, working, unexpecte..."
...,...,...,...
19992,Seeking details on securing medical data using...,en,"[Seeking, details, on, securing, medical, data..."
19993,Can you provide information on digital strateg...,en,"[Can, you, provide, information, on, digital, ..."
19994,Request for assistance in improving digital ma...,en,"[Request, for, assistance, in, improving, digi..."
19995,I am facing integration problems with IFTTT Do...,en,"[I, am, facing, integration, problems, with, I..."


In [5]:

# set window size
window_size = 10


import pandas as pd
import numpy as np
from itertools import combinations
from collections import defaultdict, Counter
from tqdm import tqdm

def compute_word_and_pair_probs(tokens, window_size=window_size, store_all_pairs=False):
    pair_counts = defaultdict(int)
    word_counts = Counter()
    total_windows = len(tokens) - window_size + 1

    print(f"Processing {total_windows} sliding windows...")

    with tqdm(total=total_windows, unit="win", unit_scale=True, dynamic_ncols=True, bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]") as pbar:
        for i in range(total_windows):
            window_tokens = tokens[i:(i + window_size)]
            unique_words = set(window_tokens)  # Unique words in the window
            word_counts.update(unique_words)  # Faster word counting

            # Generate unique word pairs and count them
            for pair in combinations(unique_words, 2):
                pair_key = "_".join(sorted(pair))  # Ensure consistent ordering
                pair_counts[pair_key] += 1

            pbar.update(1)

    print("\nPair count over sliding window finished.")

    # Convert word counts to DataFrame
    word_probs = pd.DataFrame(word_counts.items(), columns=['word', 'count'])
    word_probs['probability'] = word_probs['count'] / total_windows
    word_probs.sort_values(by='probability', ascending=False, inplace=True)

    # Convert pair counts to DataFrame
    pair_probs = pd.DataFrame([(key.split('_')[0], key.split('_')[1], count) for key, count in pair_counts.items()],
                              columns=['word1', 'word2', 'count'])
    pair_probs['probability'] = pair_probs['count'] / total_windows
    pair_probs.sort_values(by='probability', ascending=False, inplace=True)

    print(f"Computing NPMI coherence for {len(pair_probs)} word pairs...")

    # Convert word probabilities to a dictionary for fast lookup
    word_prob_dict = dict(zip(word_probs['word'], word_probs['probability']))
    epsilon = 1e-12

    coherence_measures = []
    with tqdm(total=len(pair_probs), unit="pairs", unit_scale=True, dynamic_ncols=True, bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]") as pbar:
        for _, row in pair_probs.iterrows():
            word_m, word_l, pair_prob = row['word1'], row['word2'], row['probability']

            # Fast dictionary lookups
            word_m_prob = word_prob_dict.get(word_m, epsilon)
            word_l_prob = word_prob_dict.get(word_l, epsilon)

            # NPMI Coherence Calculation
            pmi = (pair_prob + epsilon) / (word_m_prob * word_l_prob)
            npmi_coherence = np.log(pmi) / (-np.log(pair_prob + epsilon))

            coherence_measures.append({'word1': word_m, 'word2': word_l, 'pair_probability': pair_prob, 'npmi_coherence': npmi_coherence})
            pbar.update(1)

    coherence_df = pd.DataFrame(coherence_measures)

    # Store all word pairs if requested
    all_pairs_df = None
    if store_all_pairs:
        all_pairs_df = pd.DataFrame([tuple(sorted(pair_key.split('_'))) for pair_key in pair_counts.keys()], columns=['word1', 'word2'])

    print("Computation completed.")

    return {
        'word_probabilities': word_probs,
        'pair_probabilities': pair_probs,
        'all_word_pairs': all_pairs_df,
        'coherence_measures': coherence_df
    }

In [6]:
# Run the function
results = compute_word_and_pair_probs(tokens, window_size=window_size, store_all_pairs=True)
results

Processing 684555 sliding windows...


100%|██████████ 685k/685k [00:11<00:00, 59.3kwin/s] 



Pair count over sliding window finished.
Computing NPMI coherence for 854292 word pairs...


100%|██████████ 854k/854k [00:19<00:00, 43.1kpairs/s]  


Computation completed.


{'word_probabilities':                word   count  probability
 18               to  260220     0.380130
 13              the  232287     0.339326
 11              and  219489     0.320630
 27                I  140853     0.205758
 31              you  124466     0.181820
 ...             ...     ...          ...
 7979   integrators.      10     0.000015
 7980   unsanctioned      10     0.000015
 7981      problems;      10     0.000015
 7982           iOS.      10     0.000015
 11223       slowed,      10     0.000015
 
 [11224 rows x 3 columns],
 'pair_probabilities':               word1         word2  count  probability
 73              and           the  82224     0.120113
 122             the            to  76700     0.112044
 119             and            to  67486     0.098584
 207               I            to  57654     0.084221
 263               I            am  52453     0.076623
 ...             ...           ...    ...          ...
 304776  considering  requirements    

In [7]:
results=pd.DataFrame(results['coherence_measures'])
results["bigram"]=results["word1"]+" "+results["word2"]
results["bigram_synonym"]=results["word2"]+" "+results["word1"]

In [8]:
results.rename(columns={"npmi_coherence": "npmi"}, inplace=True)
results=results[["bigram","bigram_synonym","npmi"]]
results.to_csv("NPMI_mailing.csv",index=False)