In [1]:
import numpy as np
from collections import Counter, defaultdict
from itertools import chain
from typing import List

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

import unicodedata
import string

import pandas as pd

[nltk_data] Downloading package punkt_tab to /home/pe/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Scratch check: build a dict in which every key maps to its own empty list.
dct_to_remove = {}
for key in (1, 2, 3):
    dct_to_remove[key] = []
print(dct_to_remove)

{1: [], 2: [], 3: []}


In [3]:
def get_ground_truth_cooccurrences(corpus:List[List[str]], target_words:List[str], window_size=2):
    """
    Count the words that co-occur with each target word in the corpus.

    For every occurrence of a target word, the tokens located at most
    ``window_size`` positions before or after it in the same sentence are
    counted as context. The target occurrence itself is skipped, but other
    occurrences of the same word inside the window are counted.

    Args:
        corpus: Tokenized sentences (one sequence of tokens per sentence).
        target_words: Words whose context should be collected.
        window_size: Number of tokens considered on each side of a target.

    Returns:
        dict: Maps each target word to a ``Counter`` of its context words.
        A target that never appears in the corpus maps to an empty ``Counter``.
    """
    # One Counter per target, updated in place: no intermediate token lists,
    # and the dict doubles as an O(1) membership test for target words.
    co_occurring = {w: Counter() for w in target_words}

    for sentence in corpus:
        for i, token in enumerate(sentence):
            if token in co_occurring:
                start = max(0, i - window_size)
                end = min(len(sentence), i + window_size + 1)
                # Left part of the window, then right part; position i is excluded.
                co_occurring[token].update(sentence[start:i])
                co_occurring[token].update(sentence[i + 1:end])

    return co_occurring

In [4]:
def find_must_frequent_word(corpus):
    """
    Rank the words of a corpus from most to least frequent.

    Useful for spotting corpus-specific stop words. One line per word is
    printed as ``rank | word | count | percentage``.

    Args:
        corpus: Iterable of tokenized sentences (iterables of words).

    Returns:
        list: ``(word, count, percentage)`` tuples sorted by decreasing count,
        where ``percentage`` is the share of all tokens, rounded to 4 decimals.
        An empty corpus yields an empty list.
    """
    word_counts = Counter(chain.from_iterable(corpus))
    total_word_count = sum(word_counts.values())

    results = []

    # most_common() orders by decreasing count, so `rank` is a real frequency
    # rank; iterating .items() would follow first-appearance order instead.
    for rank, (word, count) in enumerate(word_counts.most_common(), 1):
        percentage = (count / total_word_count) * 100
        results.append((word, count, round(percentage, 4)))
        print(f"{rank:<5} | {word:<15} | {count:<10} | {percentage:.2f}%")

    return results


In [5]:
def compute_co_occurrence_matrix(corpus:List[List[str]], window_size:int=1):
    """
    Build a symmetric word-by-word co-occurrence matrix for a tokenized corpus.

    Cell ``[i, j]`` holds the number of times word ``j`` appears within
    ``window_size`` tokens (on either side, same sentence) of an occurrence of
    word ``i``. Because the window is symmetric, so is the matrix.

    Args:
        corpus: Tokenized sentences (one list of tokens per sentence).
        window_size: Number of tokens considered on each side of a word.

    Returns:
        tuple: ``(co_occurrence_matrix, word_list, word_to_index)`` where
        ``co_occurrence_matrix`` is a ``(vocab, vocab)`` ``int32`` array,
        ``word_list`` is the sorted vocabulary and ``word_to_index`` maps each
        word to its row/column index.
    """
    # Sorting makes the word -> index assignment identical on every run.
    word_list = sorted(set(chain.from_iterable(corpus)))
    word_to_index = {word: i for i, word in enumerate(word_list)}
    vocab_size = len(word_list)

    co_occurrence_counts = defaultdict(int)

    for sentence in corpus:
        # Resolve indices once per sentence instead of once per (target, context) pair.
        indices = [word_to_index[token] for token in sentence]
        sentence_length = len(indices)

        for i, target_idx in enumerate(indices):
            start_index = max(0, i - window_size)
            end_index = min(sentence_length, i + window_size + 1)

            for j in range(start_index, end_index):
                if i == j:
                    continue
                # Directed key: the mirrored (context, target) cell gets its own
                # increment when the loop reaches position j as the target.
                # Sorting the key here would count every co-occurrence twice.
                co_occurrence_counts[(target_idx, indices[j])] += 1

    # int32 rather than int8: pair counts above 127 are routine in a real
    # corpus and would overflow an 8-bit cell.
    co_occurrence_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for (row_idx, col_idx), count in co_occurrence_counts.items():
        co_occurrence_matrix[row_idx, col_idx] = count

    return co_occurrence_matrix, word_list, word_to_index

In [6]:
# Each English contraction suffix mapped to the same suffix preceded by a space.
contraction_map = {
    suffix: " " + suffix
    for suffix in ("n't", "'re", "'ve", "'ll", "'d", "'s", "'m")
}

def remove_accents(text: str) -> str:
    """Return `text` NFKD-normalized with every combining character dropped."""
    decomposed = unicodedata.normalize("NFKD", text)
    kept_chars = []
    for char in decomposed:
        # combining() returns 0 for characters that are not combining marks.
        if unicodedata.combining(char) == 0:
            kept_chars.append(char)
    return "".join(kept_chars)

# Straight and curly apostrophes are the only punctuation left in the text.
keep = set("'’")
base_punct = {char for char in string.punctuation}
extra_punct = {'“', '”', '‘', '’', '—', '–', '…', '«', '»'}
punct_to_remove = base_punct.union(extra_punct).difference(keep)
# Translation table that deletes every character of punct_to_remove.
TRANSL_TABLE = str.maketrans(dict.fromkeys(sorted(punct_to_remove)))

# Tokenized corpus: one list of tokens per non-empty line of the input file.
tokens_by_sentence: List[List[str]] = []

# Words filtered out of every line before tokenization
# (a French group followed by an English group).
stop_words = [
    "le", "les", "sur", "jouer", "fait", "de", "et", "la", "des", "sont",
    "a", "of", "and", "is", "in", "to", "one", "s", "or", "as",
]


with open("GPT5v2.txt", encoding="utf-8") as corpus_file:
    for raw_line in corpus_file:
        normalized = raw_line.strip().lower()
        if normalized:
            # Strip accents first, then delete the punctuation listed in TRANSL_TABLE.
            normalized = remove_accents(normalized).translate(TRANSL_TABLE)
            kept_words = [word for word in normalized.split() if word not in stop_words]
            sentence_tokens = word_tokenize(" ".join(kept_words), language="english")

            # Lines that end up with no tokens are not recorded.
            if sentence_tokens:
                tokens_by_sentence.append(sentence_tokens)

In [7]:
# get_ground_truth_cooccurrences(tokens_by_sentence, target_words=["terrain", "dog"], window_size=3)

In [8]:
# print(find_must_frequent_word(tokens_by_sentence))

In [9]:
# Co-occurrence counts using a window of 5 tokens on each side of every word.
coo_matrix, word_list, word_to_index = compute_co_occurrence_matrix(
    tokens_by_sentence, window_size=5
)

# Label rows and columns with the vocabulary so cells can be looked up by word.
cooc_df = pd.DataFrame(coo_matrix, index=word_list, columns=word_list)

