In [None]:
import numpy as np
from collections import Counter, defaultdict
from itertools import chain
from typing import Dict, List, Optional, Set

import math
import networkx as nx

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
# nltk.pos_tag (used by prepare_data_with_intonation) needs the English tagger model.
nltk.download('averaged_perceptron_tagger_eng')

import unicodedata
import string

import pandas as pd

from scipy.stats import entropy
from scipy.spatial.distance import pdist

import seaborn as sns
import plotly.figure_factory as ff

from pipData import *

from sklearn.metrics.pairwise import cosine_similarity
import re

In [None]:
# Scratch check of the dict-comprehension pattern used in the cells below:
# every key receives its own, independent empty list.
dct_to_remove = {key: list() for key in (1, 2, 3)}
print(dct_to_remove)

In [None]:
def get_ground_truth_cooccurrences(corpus:List[List[str]], target_words:List[str], window_size=2):
    """
    Count the words that actually co-occur with each target word in the corpus.

    For every occurrence of a target word, the up-to-``window_size`` tokens on
    each side of it (within the same sentence) are tallied; the target
    position itself is excluded.

    Args:
        corpus: Tokenised sentences.
        target_words: Words whose contexts are collected.
        window_size: Number of tokens considered on each side of the target.

    Returns:
        dict: Maps every target word (in ``target_words`` order) to a
        ``Counter`` of context-word frequencies. A target that never occurs
        maps to an empty ``Counter``.
    """
    # A dict keyed by target keeps the per-token membership test O(1),
    # however many target words are requested.
    co_occurring = {w: Counter() for w in target_words}

    for sentence in corpus:
        for i, token in enumerate(sentence):
            if token not in co_occurring:
                continue
            start = max(0, i - window_size)
            end = min(len(sentence), i + window_size + 1)
            # Left context first, then right context, skipping position i.
            co_occurring[token].update(sentence[start:i])
            co_occurring[token].update(sentence[i + 1:end])

    return co_occurring

In [None]:
def find_must_frequent_word(corpus):
    """
    Rank the words of a corpus from most to least frequent (stop-word candidates).

    Prints one ``rank | word | count | percentage`` line per distinct word
    and returns the same information.

    Args:
        corpus: Iterable of tokenised sentences (each an iterable of words).

    Returns:
        list: ``(word, count, percentage)`` tuples in descending count order,
        where ``percentage`` is the word's share of all tokens, rounded to
        4 decimals. Empty list for an empty corpus.
    """
    word_counts = Counter(chain.from_iterable(corpus))
    total_word_count = sum(word_counts.values())

    results = []

    # most_common() orders by descending count, so `rank` is a genuine
    # frequency rank rather than the order in which words first appear.
    for rank, (word, count) in enumerate(word_counts.most_common(), 1):
        percentage = (count / total_word_count) * 100
        results.append((word, count, round(percentage, 4)))
        print(f"{rank:<5} | {word:<15} | {count:<10} | {percentage:.2f}%")

    return results


In [None]:
def compute_co_occurrence_matrix(corpus, window_size=1):
    """
    Build a symmetric word-by-word co-occurrence matrix from tokenised sentences.

    Counting convention (as implemented below):
      * Diagonal cell ``[w, w]``: +1 for every occurrence of ``w``, plus +1
        for every ordered pair of *distinct* positions inside the window that
        both hold ``w``.
      * Off-diagonal cell ``[a, b]``: +1 each time ``b`` sits in the window of
        an ``a`` occurrence AND +1 each time ``a`` sits in the window of a
        ``b`` occurrence. Because the window is symmetric, every co-occurring
        position pair is therefore counted twice.

    Args:
        corpus: Sequence of tokenised sentences. It is iterated twice, so it
            must not be a one-shot generator.
        window_size: Number of positions considered on each side of a token.

    Returns:
        tuple: ``(matrix, word_list, word_to_index)`` where ``matrix`` is a
        ``(V, V)`` ``int64`` array, ``word_list`` is the sorted vocabulary and
        ``word_to_index`` maps each word to its row/column index.
    """
    word_list = sorted(set(chain.from_iterable(corpus)))
    word_to_index = {word: i for i, word in enumerate(word_list)}
    vocab_size = len(word_list)

    # Accumulate in a dict first: incrementing Python ints is much cheaper
    # than element-wise updates of a NumPy array.
    co_occurrence_counts = defaultdict(int)

    for sentence in corpus:
        indices = [word_to_index[word] for word in sentence]

        for i, target_idx in enumerate(indices):
            start_index = max(0, i - window_size)
            end_index = min(len(indices), i + window_size + 1)

            for j in range(start_index, end_index):
                if i == j:
                    # The token itself: counts as one occurrence of the word.
                    co_occurrence_counts[(target_idx, target_idx)] += 1
                    continue

                context_idx = indices[j]

                # Order-independent key so (a, b) and (b, a) share one counter.
                pair_key = tuple(sorted((target_idx, context_idx)))
                co_occurrence_counts[pair_key] += 1

    # int64 rather than int16: counts above 32767 would otherwise overflow
    # (silent wrap-around or OverflowError, depending on the NumPy version).
    co_occurrence_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int64)

    for (idx_a, idx_b), count in co_occurrence_counts.items():
        co_occurrence_matrix[idx_a, idx_b] = count
        if idx_a != idx_b:
            co_occurrence_matrix[idx_b, idx_a] = count

    return co_occurrence_matrix, word_list, word_to_index

In [None]:
def make_Graph_from_cooccurrence(cooc_matrix:np.ndarray, word_list:list, threshold:int=5):
    """
    Build an undirected weighted graph from a co-occurrence matrix.

    Every word becomes a node. Two distinct words are linked when their
    matrix cell is at least ``threshold``; the cell value is stored as the
    edge attribute ``weight``. Only the strict upper triangle is read, so the
    diagonal (self co-occurrence) never produces an edge.

    Args:
        cooc_matrix: Square matrix indexed in the same order as ``word_list``.
        word_list: Vocabulary; ``word_list[k]`` labels row/column ``k``.
        threshold: Minimum cell value for an edge to be created.

    Returns:
        nx.Graph: The resulting graph.
    """
    graph = nx.Graph()
    vocab_size = len(word_list)

    for row in range(vocab_size):
        source = word_list[row]
        graph.add_node(source)
        # Start at row + 1: skips the diagonal and the mirrored lower triangle.
        for col in range(row + 1, vocab_size):
            count = cooc_matrix[row, col]
            if count >= threshold:
                graph.add_edge(source, word_list[col], weight=count)

    return graph

def get_metrics_from_graph(graph_corpus:nx.Graph):
    """
    Summarise the connectivity of a co-occurrence graph.

    Args:
        graph_corpus: Undirected graph whose nodes are words.

    Returns:
        dict: Three entries:
            * ``"largest_component_coverage"``: share of nodes in the largest
              connected component, formatted as a percentage string
              (``"0.00%"`` for an empty graph).
            * ``"max_k_core_value"``: the highest core number of any node.
            * ``"average_core_connectivity"``: the mean core number, formatted
              with two decimals.
        Both core metrics fall back to 0 when NetworkX cannot compute core
        numbers for this graph.
    """
    # A. Global connectivity (largest connected component)
    node_count = len(graph_corpus)
    if node_count > 0:
        largest_cc = max(nx.connected_components(graph_corpus), key=len)
        lcc_coverage = len(largest_cc) / node_count
    else:
        lcc_coverage = 0

    # B. K-core decomposition (the densely connected "hard core" of the vocabulary)
    try:
        core_numbers = nx.core_number(graph_corpus)
    except nx.NetworkXException as exc:
        # Only NetworkX's own errors (e.g. graphs it rejects, such as ones
        # with self loops) trigger the fallback; unrelated failures propagate.
        print(f"Warning: core numbers unavailable ({exc}); reporting 0.")
        core_numbers = {}

    if core_numbers:
        max_k_core = max(core_numbers.values())
        average_core = sum(core_numbers.values()) / len(core_numbers)
    else:
        max_k_core = 0
        average_core = 0

    return {
        "largest_component_coverage": f"{lcc_coverage:.2%}",
        "max_k_core_value": max_k_core,
        "average_core_connectivity": f"{average_core:.2f}"
    }

In [None]:
# Load and tokenise the book (no stop-word removal at this stage).
corpus = prepare_data(
    file_path="GoodNightGorilla.txt",
    language="english",
    remove_accent=True,
    remove_punct=True,
    keep_apostrophes=False,
    contraction_map=None,
    stop_words=[]
)

print(corpus)

# Co-occurrence counts with a 5-token window on each side.
co_occ_m, word_list, word_index = compute_co_occurrence_matrix(corpus, window_size=5)

cooc_df = pd.DataFrame(
    data=co_occ_m,
    index=word_list,
    columns=word_list
)

# threshold=1: any pair that co-occurs at least once gets an edge.
graph_corpus = make_Graph_from_cooccurrence(co_occ_m, word_list, threshold=1)
print(graph_corpus)

# Print explicitly: a bare call in the middle of a cell is evaluated but
# never displayed, so the metrics would otherwise be lost.
graph_metrics = get_metrics_from_graph(graph_corpus)
print(graph_metrics)

# save graph to .gexf file
nx.write_gexf(graph_corpus, "good_night_gorilla_cooccurrence.gexf")

In [None]:
# # display graph_corpus
# import matplotlib.pyplot as plt
# plt.figure(figsize=(12, 12))
# pos = nx.spring_layout(graph_corpus, k=0.15)
# nx.draw_networkx_nodes(graph_corpus, pos, node_size=50)
# nx.draw_networkx_edges(graph_corpus, pos, alpha=0.3)
# nx.draw_networkx_labels(graph_corpus, pos, font_size=8)
# plt.title("Co-occurrence Graph from 'Good Night Gorilla'")
# plt.show()


In [None]:
# if len(graph_corpus) > 0:
#     largest_cc = max(nx.connected_components(graph_corpus), key=len)
#     lcc_coverage = len(largest_cc) / len(word_list)
# else:
#     lcc_coverage = 0
#     largest_cc = []
    

In [None]:
def get_parasite_word(co_occurrence_df:pd.DataFrame, percentile_threshold:int=95):
    """
    Flag "parasite" words: words whose co-occurrences are spread almost evenly
    over the whole vocabulary.

    Each row is normalised to a probability distribution and scored with its
    Shannon entropy in base ``n_columns``, so a score of 1 means a perfectly
    uniform row. Rows that sum to zero score 0.

    Args:
        co_occurrence_df: Word-by-word co-occurrence counts (rows are scored).
        percentile_threshold: Words whose score is at or above this percentile
            of all scores are returned as parasite words.

    Returns:
        tuple: ``(bad_words, scores_series)``. ``bad_words`` is the list of
        flagged words; ``scores_series`` holds every word's score, sorted in
        descending order. An empty input yields ``([], empty Series)``.
    """
    # np.percentile is undefined on an empty collection of scores.
    if co_occurrence_df.shape[0] == 0:
        return [], pd.Series(dtype=float)

    parasitic_scores = {}
    for word, row in co_occurrence_df.iterrows():
        row_vec = row.values
        row_sum = row_vec.sum()
        # With fewer than two columns the entropy base would be <= 1, whose
        # logarithm is zero (0/0 -> NaN); such a row carries no spread anyway.
        if row_sum == 0 or len(row_vec) < 2:
            parasitic_scores[word] = 0.0
            continue
        probs = row_vec / row_sum
        score = entropy(probs, base=len(probs))
        parasitic_scores[word] = score

    scores_series = pd.Series(parasitic_scores).sort_values(ascending=False)
    cutoff_value = np.percentile(scores_series, percentile_threshold)
    bad_words = scores_series[scores_series >= cutoff_value].index.tolist()
    return bad_words, scores_series


In [None]:
# Co-occurrence statistics at two window sizes over the same corpus, so both
# matrices share the vocabulary in `word_list`.
co_occ_m, word_list, word_index = compute_co_occurrence_matrix(corpus, window_size=4)
# NOTE: despite the `_5` suffix, this matrix is built with window_size=8.
co_occ_m_5, _, _ = compute_co_occurrence_matrix(corpus, window_size=8)

# Zero the diagonal on the NumPy array itself, before wrapping it in a
# DataFrame. Writing through `DataFrame.values` only works while pandas hands
# back a writable view, which is not the case under Copy-on-Write.
np.fill_diagonal(co_occ_m_5, 0)
cooc_df_5 = pd.DataFrame(
    data=co_occ_m_5,
    index=word_list,
    columns=word_list
)
_, all_score_5 = get_parasite_word(cooc_df_5)

# `cooc_df` keeps the diagonal; the `_diag_zero` variant is what gets scored.
cooc_df = pd.DataFrame(
    data=co_occ_m,
    index=word_list,
    columns=word_list
)

co_occ_m_diag_zero = co_occ_m.copy()
np.fill_diagonal(co_occ_m_diag_zero, 0)
cooc_df_diag_zero = pd.DataFrame(
    data=co_occ_m_diag_zero,
    index=word_list,
    columns=word_list
)

bad_word, all_score = get_parasite_word(cooc_df_diag_zero)

# Remove the flagged words from the corpus and recompute with a tighter window.
bad_word_set = set(bad_word)
res = [[x for x in sub if x not in bad_word_set] for sub in corpus]
print(res)
co_occ_m1, word_list1, word_index1 = compute_co_occurrence_matrix(res, window_size=2)
cooc_df1 = pd.DataFrame(
    data=co_occ_m1,
    index=word_list1,
    columns=word_list1
)

co_occ_m1_diag_zero = co_occ_m1.copy()
np.fill_diagonal(co_occ_m1_diag_zero, 0)
cooc_df_diag_zero1 = pd.DataFrame(
    data=co_occ_m1_diag_zero,
    index=word_list1,
    columns=word_list1
)
bad_word1, all_score1 = get_parasite_word(cooc_df_diag_zero1)

In [None]:
# Spot-check one word: its parasite score, its raw co-occurrence column,
# and the label/value of the largest entry in that column.
w_test = "across"
print(all_score.loc[w_test])
print(cooc_df.loc[:, w_test].values)
print(cooc_df.loc[:, w_test].idxmax(), cooc_df.loc[:, w_test].max())

In [None]:
def PPMI(co_occurrence_matrix:np.ndarray) -> np.ndarray:
    """
    Positive Pointwise Mutual Information of a co-occurrence matrix.

    Each cell is ``max(0, ln(observed / expected))`` where ``expected`` is
    ``row_sum * col_sum / total``. Cells whose ratio is undefined (zero counts
    or empty rows/columns) come out as 0.

    Args:
        co_occurrence_matrix: 2-D array of co-occurrence counts.

    Returns:
        np.ndarray: PPMI matrix with the same shape as the input.
    """
    marginal_rows = np.sum(co_occurrence_matrix, axis=1)
    marginal_cols = np.sum(co_occurrence_matrix, axis=0)
    grand_total = np.sum(co_occurrence_matrix)

    # Counts expected if row and column words were independent.
    independence_counts = np.outer(marginal_rows, marginal_cols) / grand_total

    # 0/0 and log(0) are expected here; they are cleaned up just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_ratio = np.log(co_occurrence_matrix / independence_counts)

    # Clip negatives (including -inf) to 0, then turn remaining NaNs into 0.
    clipped = np.where(log_ratio < 0, 0, log_ratio)
    return np.nan_to_num(clipped)

In [None]:
# PPMI graph on the diagonal-free matrix with the flagged "parasite" words removed.
cooc_df_wihtout_stop_word = cooc_df_diag_zero.copy()
drops = [w for w in bad_word if w in cooc_df_wihtout_stop_word.index]
cooc_df_wihtout_stop_word = cooc_df_wihtout_stop_word.drop(index=drops, columns=drops)

ppmi = PPMI(cooc_df_wihtout_stop_word.values)

ppmi_df = pd.DataFrame(
    data=ppmi,
    index=cooc_df_wihtout_stop_word.index,
    columns=cooc_df_wihtout_stop_word.columns
)

G = nx.from_pandas_adjacency(ppmi_df)
nx.write_gexf(G, "good_night_gorilla_PPMI.gexf")

# Louvain is randomised: a fixed seed makes the partition reproducible across
# reruns. The result is printed because a bare expression in the middle of a
# cell is evaluated but never displayed.
communities = nx.community.louvain_communities(G, seed=42)
print(communities)


# Same PPMI/graph export on `cooc_df_5`, without removing any word, for comparison.
ppmi_without_traitement = PPMI(cooc_df_5.values)
ppmi_df_without_traitement = pd.DataFrame(
    data=ppmi_without_traitement,
    index=cooc_df_5.index,
    columns=cooc_df_5.columns
)

G = nx.from_pandas_adjacency(ppmi_df_without_traitement)
nx.write_gexf(G, "good_night_gorilla_PPMI_without_traitement_size8.gexf")


In [None]:
# PPMI graph on the matrix recomputed from the corpus that already had the
# flagged words removed (`cooc_df_diag_zero1`).
cooc_df_wihtout_stop_word = cooc_df_diag_zero1.copy()
# NOTE(review): this filters with `bad_word`, not `bad_word1`. If
# `cooc_df_diag_zero1` was built from the corpus with `bad_word` already
# stripped, `drops` is always empty - confirm which list was intended.
drops = [w for w in bad_word if w in cooc_df_wihtout_stop_word.index]
cooc_df_wihtout_stop_word = cooc_df_wihtout_stop_word.drop(index=drops, columns=drops)

ppmi = PPMI(cooc_df_wihtout_stop_word.values)

ppmi_df = pd.DataFrame(
    data=ppmi,
    index=cooc_df_wihtout_stop_word.index,
    columns=cooc_df_wihtout_stop_word.columns
)

G = nx.from_pandas_adjacency(ppmi_df)
# NOTE(review): "good_night_gorilla_PPMI.gexf" is also written by another cell
# of this notebook; whichever runs last wins - confirm that is intended.
nx.write_gexf(G, "good_night_gorilla_PPMI.gexf")

# Louvain is randomised: a fixed seed makes the displayed partition
# reproducible across reruns. Kept as the last expression so it is displayed.
nx.community.louvain_communities(G, seed=42)



In [None]:
# Cosine similarity between the PPMI rows of two words.
word_a, word_b = "gorilla", "zookeeper"
ppmi = ppmi_df

# scikit-learn's pairwise API expects 2-D inputs, hence the single-row reshape.
vec_a = ppmi.loc[word_a].to_numpy().reshape(1, -1)
vec_b = ppmi.loc[word_b].to_numpy().reshape(1, -1)
score = cosine_similarity(vec_a, vec_b)[0, 0]
print(f"Cosine Similarity between '{word_a}' and '{word_b}': {score:.4f}")

In [None]:
# Cosine similarity between the PPMI rows of two words.
word_a, word_b = "gorilla", "animal"
ppmi = ppmi_df

# scikit-learn's pairwise API expects 2-D inputs, hence the single-row reshape.
vec_a = ppmi.loc[word_a].to_numpy().reshape(1, -1)
vec_b = ppmi.loc[word_b].to_numpy().reshape(1, -1)
score = cosine_similarity(vec_a, vec_b)[0, 0]
print(f"Cosine Similarity between '{word_a}' and '{word_b}': {score:.4f}")

In [None]:
# Cosine similarity between the PPMI rows of two words.
word_a, word_b = "lion", "hyena"
ppmi = ppmi_df

# scikit-learn's pairwise API expects 2-D inputs, hence the single-row reshape.
vec_a = ppmi.loc[word_a].to_numpy().reshape(1, -1)
vec_b = ppmi.loc[word_b].to_numpy().reshape(1, -1)
score = cosine_similarity(vec_a, vec_b)[0, 0]
print(f"Cosine Similarity between '{word_a}' and '{word_b}': {score:.4f}")

In [None]:
# Cosine similarity between the PPMI rows of two words.
word_a, word_b = "lion", "little"
ppmi = ppmi_df

# scikit-learn's pairwise API expects 2-D inputs, hence the single-row reshape.
vec_a = ppmi.loc[word_a].to_numpy().reshape(1, -1)
vec_b = ppmi.loc[word_b].to_numpy().reshape(1, -1)
score = cosine_similarity(vec_a, vec_b)[0, 0]
print(f"Cosine Similarity between '{word_a}' and '{word_b}': {score:.4f}")

In [None]:
# Cosine similarity between the PPMI rows of two words.
word_a, word_b = "lion", "giraffe"
ppmi = ppmi_df

# scikit-learn's pairwise API expects 2-D inputs, hence the single-row reshape.
vec_a = ppmi.loc[word_a].to_numpy().reshape(1, -1)
vec_b = ppmi.loc[word_b].to_numpy().reshape(1, -1)
score = cosine_similarity(vec_a, vec_b)[0, 0]
print(f"Cosine Similarity between '{word_a}' and '{word_b}': {score:.4f}")

In [None]:
def find_nearest_neighbors(target_word, ppmi_df, top_n=5):
    """
    Return the words whose rows are most cosine-similar to ``target_word``'s row.

    Args:
        target_word: Label of the query row in ``ppmi_df``.
        ppmi_df: DataFrame with one row vector per word.
        top_n: Number of neighbours to return.

    Returns:
        pd.Series: Up to ``top_n`` similarity scores indexed by word, in
        descending order, with ``target_word`` itself excluded.
    """
    # The pairwise API needs a 2-D query, so the row becomes shape (1, n_columns).
    query = ppmi_df.loc[target_word].to_numpy().reshape(1, -1)
    similarities = cosine_similarity(ppmi_df.to_numpy(), query).ravel()

    ranked = pd.Series(similarities, index=ppmi_df.index).sort_values(ascending=False)
    # Remove the query word (it trivially matches itself) before truncating.
    return ranked.drop(target_word).head(top_n)

In [None]:
# Ten closest words to the query by cosine similarity of PPMI rows.
word_a = "banana"
nearest_neighbors = find_nearest_neighbors(target_word=word_a, ppmi_df=ppmi_df, top_n=10)
print(f"Nearest Neighbors to '{word_a}':")
print(nearest_neighbors)

In [None]:
# Ten closest words to "gorilla" by cosine similarity of PPMI rows.
nearest_neighbors = find_nearest_neighbors(target_word="gorilla", ppmi_df=ppmi_df, top_n=10)
print("Nearest Neighbors to 'gorilla':")
print(nearest_neighbors)

In [None]:
# Ten closest words to the query by cosine similarity of PPMI rows.
word_a = "little"
nearest_neighbors = find_nearest_neighbors(target_word=word_a, ppmi_df=ppmi_df, top_n=10)
print(f"Nearest Neighbors to '{word_a}':")
print(nearest_neighbors)

In [None]:
# Ten closest words to the query by cosine similarity of PPMI rows.
word_a = "mouse"
nearest_neighbors = find_nearest_neighbors(target_word=word_a, ppmi_df=ppmi_df, top_n=10)
print(f"Nearest Neighbors to '{word_a}':")
print(nearest_neighbors)

In [None]:
def remove_accents(text: str) -> str:
    """Strip diacritics from text (e.g., 'café' -> 'cafe').

    The text is decomposed with Unicode NFKD normalisation, after which every
    combining character is discarded.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # combining() returns 0 for base characters and non-zero for combining marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

def prepare_data_with_intonation(
    file_path: str,
    language: str,
    remove_accent: bool = True,
    remove_punct: bool = True,
    keep_apostrophes: bool = True,
    contraction_map: Optional[Dict[str, str]] = None,
    stop_words: Optional[List[str]] = None,
    break_line: bool = True,
    expand_is_contraction: bool = True
    ) -> List[List[str]]:
    """
    Read a UTF-8 text file and return it as a list of cleaned token lists.

    Each line is lower-cased and, per sentence, goes through these steps in
    order: contraction replacement, hyphen/em-dash handling, accent removal,
    punctuation removal, NLTK tokenisation, optional POS-based handling of
    ``'s``, apostrophe stripping and stop-word filtering.

    Args:
        file_path: Path of the text file to read.
        language: Language name passed to ``word_tokenize``. The POS-based
            ``'s`` handling only runs when this is ``'english'``.
        remove_accent: Strip diacritics with ``remove_accents``.
        remove_punct: Replace ``string.punctuation`` characters with spaces.
        keep_apostrophes: Keep apostrophes during punctuation removal. They
            are also kept whenever ``expand_is_contraction`` is true, and
            leading/trailing apostrophes are always stripped from the final
            tokens.
        contraction_map: Substring -> replacement, applied to the lower-cased
            text before any other cleaning.
            NOTE(review): matching is plain substring matching with no word
            boundaries (e.g. a key "it's" also matches inside "rabbit's") -
            confirm this is acceptable for the maps in use.
        stop_words: Tokens to drop (compared after apostrophe stripping).
        break_line: If true, split every line into sentences on runs of
            ``.``, ``!`` or ``?``; otherwise each line is one sentence.
        expand_is_contraction: For English, POS-tag the tokens, drop tokens
            tagged ``POS`` (possessive marker) and replace ``'s`` tagged
            ``VBZ`` with ``"is"``. Needs the NLTK POS tagger model.

    Returns:
        One token list per non-empty sentence. Nothing here treats intonation
        marks specially; presumably they are digits that survive tokenisation
        as ordinary tokens - confirm against the input file.
    """
    sentence_split_re = re.compile(r'[\.!\?]+')

    contraction_re = None
    if contraction_map:
        # Longest keys first, so a key that is a prefix of a longer key can
        # never shadow it in the alternation.
        pattern = "|".join(
            re.escape(k) for k in sorted(contraction_map, key=len, reverse=True)
        )
        contraction_re = re.compile(f"({pattern})")

    punctuation_chars = set(string.punctuation)
    if keep_apostrophes or expand_is_contraction:
        # The tokenizer needs the apostrophe to split off "'s" / "n't".
        punctuation_chars -= {"'", "’"}

    punct_trans_table = str.maketrans({c: " " for c in punctuation_chars})
    stop_words_set: Set[str] = set(stop_words) if stop_words else set()
    tokens_by_sentence: List[List[str]] = []

    with open(file_path, encoding="utf-8") as f:
        for line in f:
            lowered = line.strip().lower()
            sub_lines = sentence_split_re.split(lowered) if break_line else [lowered]

            for s in sub_lines:
                if not s:
                    continue

                if contraction_re:
                    s = contraction_re.sub(lambda m: contraction_map[m.group(0)], s)

                # Hyphens are deleted (joining hyphenated words); em dashes
                # become word separators.
                s = s.replace("-", "")
                s = s.replace("—", " ")

                if remove_accent:
                    s = remove_accents(s)

                if remove_punct:
                    s = s.translate(punct_trans_table)

                toks = word_tokenize(s, language=language)

                if expand_is_contraction and language == 'english':
                    tagged = nltk.pos_tag(toks)
                    new_toks = []
                    for word, tag in tagged:
                        if tag == 'POS':
                            # Possessive marker: dropped.
                            continue
                        elif word in ["'s", "’s"] and tag == 'VBZ':
                            # "'s" used as a verb is the contraction of "is".
                            new_toks.append("is")
                        else:
                            new_toks.append(word)
                    toks = new_toks

                clean_toks = []
                for t in toks:
                    t_stripped = t.strip("'’")
                    if t_stripped and t_stripped not in stop_words_set:
                        clean_toks.append(t_stripped)

                if clean_toks:
                    tokens_by_sentence.append(clean_toks)

    return tokens_by_sentence

def separate_text_intonation(data:List[List[str]]):
    """
    Split interleaved ``word, mark, word, mark, ...`` sentences into two
    parallel lists.

    Tokens at even positions are taken as words and tokens at odd positions
    as intonation marks. A sentence is kept only if it has as many marks as
    words, every word is alphabetic and every mark is made of digits;
    otherwise a diagnostic is printed and the sentence is skipped.

    Args:
        data: Sentences as flat token lists alternating word and mark.

    Returns:
        tuple: ``(texts, intonations)``, two lists of the same length where
        ``texts[k][j]`` is paired with ``intonations[k][j]``. Rejected
        sentences are absent from both, so indices do not line up with
        ``data`` when any sentence was skipped.
    """
    texts = []
    intonations = []
    for sentence in data:
        intonation = sentence[1::2]
        text = sentence[::2]
        # An odd-length sentence leaves the last word without a mark; pairing
        # the two lists later would then silently drop that word.
        lengths_match = len(text) == len(intonation)
        if (
            lengths_match
            and all(t.isalpha() for t in text)
            and all(t.isdigit() for t in intonation)
        ):
            texts.append(text)
            intonations.append(intonation)
        else:
            print("Warning: Mismatched text and intonation in sentence:", sentence)
            print("Extracted text:", text)
            print("Extracted intonation:", intonation)
            if not lengths_match:
                print(" Length mismatch:", len(text), "text tokens vs", len(intonation), "intonation tokens")
            for t in text:
                if not t.isalpha():
                    print(" Non-alpha text token:", t)
            for i in intonation:
                if not i.isdigit():
                    print(" Non-digit intonation token:", i)

    return texts, intonations

In [None]:
# Tokenise the intonation-annotated book one line per sentence (break_line=False),
# fusing a few contractions into single tokens before tokenisation.
data = prepare_data_with_intonation(
    file_path="GoodNightGorilla_Intonation.txt",
    language='english',
    break_line=False,
    remove_accent=True,
    remove_punct=True,
    keep_apostrophes=False,
    stop_words=["s", "n't"],
    contraction_map={
        "that's": "thatis",
        "it's": "itis",
        "don't": "donot",
        "doesn't": "doesnot",
    },
)

# One tokenised sentence per output line.
for s in data:
    print(s)

# Split the interleaved tokens into parallel word / intonation lists.
texts, intonations = separate_text_intonation(data)

In [None]:
# Keep, sentence by sentence, only the words whose intonation mark is non-zero.
text_without_0intonation = []
intonation_without_0intonation = []

for sentence_t, sentence_i in zip(texts, intonations):
    text_without_0intonation.append([])
    intonation_without_0intonation.append([])
    for t, i in zip(sentence_t, sentence_i):
        if int(i) != 0:
            text_without_0intonation[-1].append(t)
            intonation_without_0intonation[-1].append(i)

# NOTE: this rebinds `word_list` to the vocabulary of the filtered text.
occ_m, word_list, word_to_index = compute_co_occurrence_matrix(text_without_0intonation, window_size=4)

# Put diagonal to zero on the NumPy matrix itself, before wrapping it in a
# DataFrame. Writing through `DataFrame.values` only works while pandas hands
# back a writable view, which is not the case under Copy-on-Write.
np.fill_diagonal(occ_m, 0)
ooc_df = pd.DataFrame(
    data=occ_m,
    index=word_list,
    columns=word_list
)

ppmi_without_0intonation = PPMI(ooc_df.values)

# Graph from PPMI without 0 intonation
ppmi_df_without_0intonation = pd.DataFrame(
    data=ppmi_without_0intonation,
    index=ooc_df.index,
    columns=ooc_df.columns
)

# Only associations with PPMI >= 1 become edges; weaker cells are zeroed.
filtered_df = ppmi_df_without_0intonation.where(ppmi_df_without_0intonation >= 1, 0)
G = nx.from_pandas_adjacency(filtered_df)
nx.write_gexf(G, "good_night_gorilla_PPMI_without_0intonation.gexf")


In [None]:
# Ten closest words to the query in the non-zero-intonation PPMI space.
word_a = "banana"
df = ppmi_df_without_0intonation
nearest_neighbors = find_nearest_neighbors(target_word=word_a, ppmi_df=df, top_n=10)
print(f"Nearest Neighbors to '{word_a}':")
print(nearest_neighbors)

In [None]:
# Ten closest words to the query in the non-zero-intonation PPMI space.
word_a = "gorilla"
df = ppmi_df_without_0intonation
nearest_neighbors = find_nearest_neighbors(target_word=word_a, ppmi_df=df, top_n=10)
print(f"Nearest Neighbors to '{word_a}':")
print(nearest_neighbors)

In [None]:
# Ten closest words to the query in the non-zero-intonation PPMI space.
word_a = "little"
df = ppmi_df_without_0intonation
nearest_neighbors = find_nearest_neighbors(target_word=word_a, ppmi_df=df, top_n=10)
print(f"Nearest Neighbors to '{word_a}':")
print(nearest_neighbors)