In [20]:
import pandas as pd
import nltk
from nltk.collocations import *


# Find collocations of a word

## Load texts

In [4]:
df = pd.read_csv('data/severian_plaintext.csv')

In [5]:
select = df.loc[df['urn'] == 'pta0001.pta003.pta-grc1', 'text']

In [6]:
selected_data = "".join(select.values)

## Define function

In [45]:
# Function to get all occurrences of top ngrams with frequency in a window of x words
def get_ngram_info(tokens, ngrams, frequency, top, window):
    if ngrams == "bigram":
        finder = BigramCollocationFinder.from_words(tokens)
        measures = nltk.collocations.BigramAssocMeasures()
    elif ngrams == "trigram":
        finder = TrigramCollocationFinder.from_words(tokens)
        measures = nltk.collocations.TrigramAssocMeasures()
    elif ngrams == "fourgram":
        finder = QuadgramCollocationFinder.from_words(tokens)
        measures = nltk.collocations.QuadgramAssocMeasures()
    else:
        print("Not allowed")
    finder.apply_freq_filter(frequency)
    ngrams_list = finder.nbest(measures.pmi, top) 
    ngram_info = []
    for ngram in ngrams_list:
        # Find all positions of the first word of the bigram
        first_word_indices = [i for i, token in enumerate(tokens) if token == ngram[0]]
        
        for first_word_index in first_word_indices:
            # Get the context
            start_index = max(0, first_word_index - window)
            end_index = min(len(tokens), first_word_index + window + 1)
            context = " ".join(tokens[start_index:end_index])
            
            # Append the bigram, its position, and context to the list
            ngram_info.append({
                'ngram': ngram,
                'position': first_word_index,
                'context': context
            })
    
    return ngram_info


## Analyse

In [None]:
# Tokenize the text
tokens = selected_data.lower().split()  # Convert to lower case for uniformity


In [46]:

# Get bigram information
ngram_info = get_ngram_info(tokens, ngrams='trigram', frequency=5, top=10, window=5)

# Show results
for info in ngram_info:
    print(f"Ngram: {info['ngram']}, Position: {info['position']}, Context: {info['context']}")


Ngram: ('ἰησοῦς', 'ποιεῖν', 'τε'), Position: 763, Context: φόβον τῶν ἰουδαίων, ἦλθεν ὁ ἰησοῦς καὶ ἔστη εἰς τὸ μέσον.
Ngram: ('ἰησοῦς', 'ποιεῖν', 'τε'), Position: 2000, Context: τῆς ἀναστάσεως, ἦλθε πάλιν ὁ ἰησοῦς τῶν θυρῶν κεκλεισμένων καὶ ἔστη
Ngram: ('ἰησοῦς', 'ποιεῖν', 'τε'), Position: 2911, Context: ὦ θεόφιλε, ὧν ἤρξατο ὁ ἰησοῦς ποιεῖν τε καὶ διδάσκειν. οὐ
Ngram: ('ἰησοῦς', 'ποιεῖν', 'τε'), Position: 2935, Context: παῦλος, ἀλλʼ ὧν ἤρξατο ὁ ἰησοῦς ποιεῖν τε καὶ διδάσκειν. δῆλον
Ngram: ('ἰησοῦς', 'ποιεῖν', 'τε'), Position: 2971, Context: ὦ θεόφιλε, ὧν ἤρξατο ὁ ἰησοῦς ποιεῖν τε καὶ διδάσκειν, ἄχρι
Ngram: ('ἰησοῦς', 'ποιεῖν', 'τε'), Position: 3273, Context: ὦ θεόφιλε, ὧν ἤρξατο ὁ ἰησοῦς ποιεῖν τε καὶ διδάσκειν, ἄχρι
Ngram: ('ἰησοῦς', 'ποιεῖν', 'τε'), Position: 3498, Context: ὦ θεόφιλε, ὧν ἤρξατο ὁ ἰησοῦς ποιεῖν τε καὶ διδάσκειν. καὶ
Ngram: ('ἰησοῦς', 'ποιεῖν', 'τε'), Position: 8037, Context: τί ἑστήκατε ἐμβλέποντες; οὗτος ὁ ἰησοῦς ὁ ἀναληφθεὶς ἀφʼ ὑμῶν. ἐπειδὴ
Ngram: ('ἰησοῦς', 'ποιεῖν