In [1]:

%pprint
import sys
from os import path as osp
sys.path.insert(1, osp.abspath(osp.join('..', 'py')))
from nt_utils import TranscriptionUtilities

tu = TranscriptionUtilities(
    data_folder_path=osp.abspath('../data'),
    saves_folder_path=osp.abspath('../saves')
)

Pretty printing has been turned OFF



----
## Here's a manual transcription of the first few paragraphs of "Egregores, Mobs, and Demons"

In [3]:

file_path = osp.join(tu.saves_text_folder, 'Egregores_Mobs_and_Demons_with_Jordan_Hall_John_Vervaeke.txt')
with open(file_path, 'r', encoding='utf-8') as f:
    essay_str = '\n\n'.join(f.read().split('\n\n')[1:3])
print(essay_str)

We'll have to be generous with me in this conversation. Sometimes I'll stop you and say, "I don't know what that means." Maybe I can't even broaden it. My sense is that we'll want to be consciously moving quite slowly because the thing we're dealing with is, in some sense, the most esoteric thing possible. Language itself isn't going to be an adequate tool for addressing it.

Yeah, that makes sense. We've got to listen to each other but follow the logos. That's going to require all of our virtuosity and virtue. This conversation was sparked by a discussion, Jordan, that you had with a few other people on Rebel Wisdom about egregors. The notion of egregors has been floating around. I've seen different people talk about it, especially because everybody seems to be reading Tomberg's *Meditations on the Tarot* right now, which I'm reading at this moment. He also talks about egregors. This spawned several articles on my website from people in my mind space, talking about higher beings and c


## Let's try pulling out all the noun phrases

In [13]:

from collections import Counter
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import words as nltk_words
from nltk.tokenize import word_tokenize
from nltk.tree import Tree
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
import string

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to C:\Users\daveb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\daveb/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\daveb/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\daveb/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [8]:

def extract_noun_phrases(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Perform part-of-speech tagging
    tagged = pos_tag(tokens)
    
    # Perform named entity recognition
    chunked = ne_chunk(tagged)
    
    # Extract noun phrases
    noun_phrases = []
    
    for subtree in chunked:
        if isinstance(subtree, Tree):
            if subtree.label() == 'NP':
                noun_phrases.append(' '.join([token for token, pos in subtree.leaves()]))
            else:
                noun_phrases.append(' '.join([token for token, pos in subtree.leaves()]))
        elif subtree[1].startswith('NN'):
            noun_phrases.append(subtree[0])
    
    return noun_phrases

In [9]:

# Example usage
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read().strip()

noun_phrases = extract_noun_phrases(text)

print("Extracted noun phrases:")
for phrase in noun_phrases:
    print(f"- {phrase}")

Extracted noun phrases:
- Jonathan Pageau
- Welcome
- Symbolic World
- things
- conversation
- clarity
- Jordan
- Part
- set
- terms
- ones
- John
- terms
- background
- something
- conversation
- conversation
- sense
- thing
- sense
- thing
- tool
- sense
- logos
- virtuosity
- virtue
- conversation
- discussion
- Jordan
- people
- Rebel Wisdom
- egregors
- notion
- egregors
- people
- everybody
- Tomberg
- Meditations
- Tarot
- *
- moment
- egregors
- articles
- website
- people
- mind
- space
- beings
- interest
- agency
- levels
- agency
- interests
- days
- John
- part
- work
- Jordan
- conversation
- question
- wisdom
- Very
- agency
- material
- sense
- category
- degree
- category
- reality
- asymmetry
- lines
- mainstream
- sorts
- notions
- head
- years
- novice
- category
- audience
- contributor
- participatory
- audience
- papers
- Dan Chiappi
- journals
- cognition
- deeply
- Cognition
- work
- Chalmers
- Clark
- time
- Ed Hutchins
- work
- cognition
- cognition
- system


In [10]:

def extract_tfidf_phrases(text, min_length=3, max_length=5):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    chunked = ne_chunk(tagged)
    
    noun_phrases = []
    
    for i in range(len(tokens)):
        for j in range(i + min_length, min(i + max_length + 1, len(tokens) + 1)):
            phrase = ' '.join(tokens[i:j])
            if min_length <= len(phrase.split()) <= max_length:
                noun_phrases.append(phrase)
    
    return list(set(noun_phrases))  # Remove duplicates

In [11]:

def get_top_tfidf_phrases(file_path, num_phrases=10, min_length=3, max_length=5):
    # Read the main document
    with open(file_path, 'r', encoding='utf-8') as f:
        main_text = f.read().strip()
    
    # Extract noun phrases from the main document
    main_phrases = extract_tfidf_phrases(main_text, min_length, max_length)
    
    # Create a corpus of documents (you may want to add more documents here)
    corpus = [main_text]
    
    # Add some generic text to the corpus for comparison
    generic_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        "Python is a high-level programming language.",
        "Machine learning is a subset of artificial intelligence.",
    ]
    corpus.extend(generic_texts)
    
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(ngram_range=(min_length, max_length))
    tfidf_matrix = vectorizer.fit_transform(corpus)
    
    # Get feature names (phrases)
    feature_names = vectorizer.get_feature_names_out()
    
    # Get TF-IDF scores for the main document (first in the corpus)
    tfidf_scores = tfidf_matrix[0].toarray()[0]
    
    # Create a dictionary of phrases and their TF-IDF scores
    phrase_scores = {phrase: score for phrase, score in zip(feature_names, tfidf_scores)}
    
    # Filter scores to include only the extracted noun phrases
    filtered_scores = {phrase: score for phrase, score in phrase_scores.items() if phrase in main_phrases}
    
    # Sort phrases by TF-IDF score in descending order
    sorted_phrases = sorted(filtered_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Return the top N phrases
    return sorted_phrases[:num_phrases]

In [12]:

# Example usage
top_phrases = get_top_tfidf_phrases(file_path, num_phrases=10, min_length=3, max_length=5)

print("Top distinctive phrases (3 to 5 words):")
for phrase, score in top_phrases:
    print(f"- {phrase}: {score:.4f}")

Top distinctive phrases (3 to 5 words):
- the notion of: 0.0553
- one of the: 0.0452
- the top down: 0.0452
- have to be: 0.0402
- this is what: 0.0402
- part of the: 0.0352
- the idea of: 0.0352
- we can do: 0.0352
- we have to: 0.0352
- what it is: 0.0352


In [24]:

def get_rare_words(file_path, min_length=5, max_words=20):
    # Read the text file
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove punctuation and numbers
    tokens = [word for word in tokens if word.isalpha()]

    # Get word frequencies
    word_freq = Counter(tokens)

    # Get a set of common English words
    common_words = set(word.lower() for word in nltk_words.words())

    # Filter and rank words
    rare_words = []
    for word, freq in word_freq.items():
        if len(word) >= min_length and word not in common_words:
            
            # Score is based on word length and frequency
            score = len(word)**2 * freq
            rare_words.append((word, score))

    # Sort rare words by score in descending order
    rare_words.sort(key=lambda x: x[1], reverse=True)

    return rare_words[:max_words]

In [25]:

# Example usage
import pyperclip

rare_words = get_rare_words(file_path)

print("Top rare words:")
words_list = []
for word, score in rare_words:
    print(f"- {word}: {score:.2f}")
    words_list.append(word)
pyperclip.copy(r'\b(' + '|'.join(words_list) + r')\b')

Top rare words:
- things: 1332.00
- beings: 900.00
- autopoietic: 847.00
- agents: 792.00
- represented: 726.00
- kairos: 720.00
- processing: 700.00
- happens: 637.00
- processes: 567.00
- bringing: 512.00
- challenging: 484.00
- structures: 400.00
- entities: 384.00
- differences: 363.00
- virtues: 343.00
- bureaucracies: 338.00
- participating: 338.00
- relationships: 338.00
- conversations: 338.00
- creating: 320.00
