In [1]:
import stanza, re, benepar, psutil, gc, json, csv, time

import numpy as np
import networkx as nx
import pandas as pd
import seaborn as sns

from textstat import textstat
from datetime import datetime
from collections import Counter

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk import Tree

import matplotlib.pyplot as plt
from matplotlib.ticker import LogLocator, LogFormatter

In [None]:
# These only need to run once
# stanza.download('en')
# spacy.cli.download("en_core_web_sm")
# nltk.download('punkt_tab')
# benepar.download('benepar_en3')

In [2]:
def memcheck():
    gc.collect()
    memory_info = psutil.virtual_memory()

    # Display the memory information in GB
    total_memory = memory_info.total / (1024 ** 3)
    available_memory = memory_info.available / (1024 ** 3)
    used_memory = memory_info.used / (1024 ** 3)

    print(f"Total Memory: {total_memory:.2f} GB")
    print(f"Available Memory: {available_memory:.2f} GB")
    print(f"Used Memory: {used_memory:.2f} GB")

In [None]:
def lexical_richness(text):
    tokens = word_tokenize(text)
    fdist = FreqDist(tokens)
    type_token_ratio = len(fdist) / len(tokens)
    return type_token_ratio

# test it
test_text = "The quick brown fox jumps over the lazy dog."
richness = lexical_richness(test_text)
print(f"Lexical Richness (Type-Token Ratio): {richness}")


In [3]:
def sentence_generator(text):
    """
    Generates sentences from a large text string one at a time.

    :param text: The large text string
    :yield: Each sentence as a stanza Sentence object
    """
    buffer = ""
    lines = text.splitlines()

    for line in lines:
        buffer += line.strip() + " "
        doc_buffer = nlp_tokens(buffer)
        
        for sentence in doc_buffer.sentences:
            yield sentence
            
        # Clear the buffer if it's been processed
        if doc_buffer.sentences:
            buffer = ""

    # Process any remaining sentences in the buffer
    if buffer.strip():
        doc_buffer = nlp_tokens(buffer)
        for sentence in doc_buffer.sentences:
            yield sentence

def count_sentences_in_string(text):
    """
    Counts the number of sentences in a large text string using a generator.

    :param text: The large text string
    :return: Total number of sentences in the text
    """
    sentence_count = sum(1 for _ in sentence_generator(text))
    return sentence_count

In [None]:

def plot_sentence_graph(sentence):
    # plots graph of a stanza sentence object
    # Initialize a directed graph
    G = nx.DiGraph()

    # Add nodes and edges from the dependency parse
    for word in sentence.words:
        G.add_node(word.id, label=word.text)
        if word.head != 0:  # If the word has a head, add an edge
            G.add_edge(word.head, word.id, label=word.deprel)

    # Draw the graph
    pos = nx.spring_layout(G)  # Positioning of nodes
    labels = nx.get_node_attributes(G, 'label')
    edge_labels = nx.get_edge_attributes(G, 'label')

    plt.figure(figsize=(10, 7))
    nx.draw(G, pos, labels=labels, with_labels=True, node_size=3000, node_color='lightblue', font_size=10, font_weight='bold', arrows=True)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=9)

    plt.show()


In [4]:
def reading_ease(text):

    flesch_reading_ease = textstat.flesch_reading_ease(text)
    flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    gunning_fog = textstat.gunning_fog(text)
    coleman_liau_index = textstat.coleman_liau_index(text)

    print(f"Flesch Reading Ease: {flesch_reading_ease}")
    print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_grade}")
    print(f"Gunning Fog Index: {gunning_fog}")
    print(f"Coleman-Liau Index: {coleman_liau_index}")


nlp files to be created (as needed) every time the notebook is launched (or the kernel is restarted):

In [5]:
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse,sentiment,ner')
nlp_tokens = stanza.Pipeline(lang='en', processors='tokenize', use_gpu=False)

2024-08-31 12:14:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-08-31 12:14:25 INFO: Downloaded file to C:\Users\Roland\stanza_resources\resources.json
2024-08-31 12:14:26 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| pos       | combined_charlm           |
| lemma     | combined_nocharlm         |
| depparse  | combined_charlm           |
| sentiment | sstplus_charlm            |
| ner       | ontonotes-ww-multi_charlm |

2024-08-31 12:14:26 INFO: Using device: cpu
2024-08-31 12:14:26 INFO: Loading: tokenize
2024-08-31 12:14:28 INFO: Loading: mwt
2024-08-31 12:14:28 INFO: Loading: pos
2024-08-31 12:14:28 INFO: Loading: lemma
2024-08-31 12:14:28 INFO: Loading: depparse
2024-08-31 12:14:28 INFO: Loading: sentiment
2024-08-31 12:14:29 INFO: Loading: ner
2024-08-31 12:14:29 INFO: Done loading processors!
2024-08-31 12:14:29 INFO: Checking for updates to resources.json in

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-08-31 12:14:30 INFO: Downloaded file to C:\Users\Roland\stanza_resources\resources.json
2024-08-31 12:14:30 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2024-08-31 12:14:30 INFO: Using device: cpu
2024-08-31 12:14:30 INFO: Loading: tokenize
2024-08-31 12:14:30 INFO: Loading: mwt
2024-08-31 12:14:30 INFO: Done loading processors!


In [6]:
nlp_spacy = spacy.load('en_core_web_sm')

# Add the benepar parser to the pipeline

# tokenizer = T5Tokenizer.from_pretrained('t5-small', clean_up_tokenization_spaces=True)  
# default is currently True but will change to False

#nlp_spacy.add_pipe("benepar", config={"model": "benepar_en3"})

In [7]:
source_texts=["Kazuo Ishiguro - Never Let Me Go.txt", "Kazuo Ishiguro - The Remains of the Day"]
directory_path = "C:/Users/Roland/Documents/AI/stylometry/"
file_path = directory_path+source_texts[1]+".txt"
print (file_path)

C:/Users/Roland/Documents/AI/stylometry/Kazuo Ishiguro - The Remains of the Day.txt


In [8]:
encodings = ['utf-8', 'ascii', 'latin-1','windows-1252', 'iso-8859-1', 'iso-8859-15']
for i, encoding in enumerate(encodings):
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            test_text = file.read()
        print("""Successfully read the file with encoding: {encoding} (encoding index {ind}).  This gives the beginning as:\n{content}\n""".format(encoding=encoding, ind=i, content=test_text[0:200].replace("\n", "  ")))    
    except UnicodeDecodeError:
        print(f"Failed to read with encoding: {encoding}")

Failed to read with encoding: utf-8
Failed to read with encoding: ascii
Successfully read the file with encoding: latin-1 (encoding index 2).  This gives the beginning as:
    Prologue: July 1956    Darlington Hall    IT seems increasingly likely that I really will undertake the expedition that has been preoccupying my imagination now for some days. An expedition, I should sa

Successfully read the file with encoding: windows-1252 (encoding index 3).  This gives the beginning as:
    Prologue: July 1956    Darlington Hall    IT seems increasingly likely that I really will undertake the expedition that has been preoccupying my imagination now for some days. An expedition, I should sa

Successfully read the file with encoding: iso-8859-1 (encoding index 4).  This gives the beginning as:
    Prologue: July 1956    Darlington Hall    IT seems increasingly likely that I really will undertake the expedition that has been preoccupying my imagination now for some days. An expedition, I should 

Now load text file with selected encoding

In [49]:
encoding = encodings[2]
with open(file_path, 'r', encoding=encoding) as file:
    text = file.read()

In [50]:
print(f"Total characters in book:  {len(text)}")
sentence_count = count_sentences_in_string(text)
print(f"Total number of sentences: {sentence_count}")

Total characters in book:  423697
Total number of sentences: 4133


In [11]:
# process the text with stanza 

# Record the start time
start_time = datetime.now()
print(f"Starting processing at: {start_time.strftime('%H:%M:%S')}")

memcheck()
doc_stanza = nlp_stanza(text)
memcheck()

end_time = datetime.now()
print(f"Completed parsing at: {end_time.strftime('%H:%M:%S')}")

time_taken = end_time - start_time

# Print the time taken in hours, minutes, and seconds
hours, remainder = divmod(time_taken.seconds, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Time Taken: {hours} hours, {minutes} minutes, {seconds} seconds")

print(f"\ndoc_stanza has {len(doc_stanza.sentences)} sentences.")


Starting processing at: 12:15:48
Total Memory: 127.91 GB
Available Memory: 99.10 GB
Used Memory: 28.82 GB
Total Memory: 127.91 GB
Available Memory: 98.85 GB
Used Memory: 29.06 GB
Completed parsing at: 12:24:32
Time Taken: 0 hours, 8 minutes, 43 seconds

doc_stanza has 4132 sentences.


In [None]:
doc = doc_stanza
lemmas_with_punct = [word.lemma for sentence in doc.sentences for word in sentence.words]
lemmas = [word.lemma for sentence in doc.sentences for word in sentence.words if word.upos != 'PUNCT']
print(len(lemmas))
lemma_counts = Counter(lemmas)
lemma_list = list(lemma_counts.items())
sorted_lemma_list = sorted(lemma_list, key=lambda x: x[1], reverse=True)
print(sorted_lemma_list[:20])

In [82]:

# Example text
test_text = "On seeing my person, he took the opportunity to inform me that he had just that moment finalized plans to return to the United States for a period of five weeks between August and September. 'Bring me the head of John the Baptist, said King Charles'"
# Process the text
test_doc = nlp_stanza(test_text)

# Create a list to hold the modified lemmas with placeholders
lemmas_with_placeholders = []

# Iterate over sentences
for sentence in test_doc.sentences:
    # Create a set of named entity spans to avoid overlapping replacements
    entity_spans = [(entity.start_char, entity.end_char, entity.type) for entity in sentence.ents]
    
    # Initialize the last position
    last_pos = 0

    # Iterate over tokens in the sentence
    for token in sentence.tokens:
        for word in token.words:
            start_char = sentence.text.index(word.text)
            end_char = start_char + len(token.text)

            # Replace named entities with placeholders
            is_entity = False
            for start, end, ent_type in entity_spans:
                if start_char >= start and end_char <= end:
                    placeholder = f"<{ent_type}>"
                    if not is_entity:
                        lemmas_with_placeholders.append(placeholder)
                        is_entity = True
                    break

            if not is_entity:
                # Append token lemma if it’s not part of a named entity and not punctuation
                if token.words[0].pos != 'PUNCT':  # Check POS tag for punctuation
                    lemmas_with_placeholders.append(token.words[0].lemma)

# Print the modified lemmas with placeholders
print(lemmas_with_placeholders)


['on', 'see', 'my', 'person', 'he', 'take', 'the', 'opportunity', 'to', 'inform', 'I', 'that', 'he', 'have', 'just', 'that', 'moment', 'finalize', 'plan', 'to', 'return', 'to', 'the', '<GPE>', '<GPE>', 'for', 'a', 'period', 'of', '<DATE>', '<DATE>', 'between', '<DATE>', 'and', '<DATE>', 'bring', 'I', 'the', 'head', 'of', 'John', 'the', 'Baptist', 'say', 'King', 'Charles']


In [None]:
import string

# Generate lemma list including punctuation
lemmas_with_punctuation = [word.lemma for sentence in doc.sentences for word in sentence.words]

# Generate bigrams excluding those with punctuation
bigrams = [
    (lemmas_with_punctuation[i], lemmas_with_punctuation[i+1])
    for i in range(len(lemmas_with_punctuation) - 1)
    if lemmas_with_punctuation[i] not in string.punctuation and lemmas_with_punctuation[i+1] not in string.punctuation
]

# Count the occurrences of each lemma
from collections import Counter
lemma_counts = Counter(lemmas_with_punctuation)

# Count the occurrences of each bigram
bigram_counts = Counter(bigrams)

# Sort the lemma counts by frequency in descending order
sorted_lemma_tuples = sorted(lemma_counts.items(), key=lambda x: x[1], reverse=True)

# Sort the bigram counts by frequency in descending order
sorted_bigram_tuples = sorted(bigram_counts.items(), key=lambda x: x[1], reverse=True)

# Print the sorted lemma and bigram counts
print("Sorted Lemma Frequencies:")
print(sorted_lemma_tuples)

print("\nSorted Bigram Frequencies:")
print(sorted_bigram_tuples)


In [35]:
for sentence in test_doc.sentences:
    for word in sentence.words:
        if word.pos != word.upos: print(f"Word: {word.text}, UPOS: {word.upos}, XPOS: {word.xpos}, POS: {word.pos}")


In [None]:
# process the text with spacy

# Record the start time
start_time = datetime.now()
print(f"Starting processing at: {start_time.strftime('%H:%M:%S')}")

memcheck()
doc_spacy = nlp_spacy("We know you can hear us. This is the voice of the mysterons.")
memcheck()

end_time = datetime.now()
print(f"Completed parsing at: {end_time.strftime('%H:%M:%S')}")

time_taken = end_time - start_time

# Print the time taken in hours, minutes, and seconds
hours, remainder = divmod(time_taken.seconds, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Time Taken: {hours} hours, {minutes} minutes, {seconds} seconds")


In [None]:
# Get a sentence in the spacy document
sent = list(doc_spacy.sents)[1]

# Convert the constituency parse to an nltk Tree
tree = Tree.fromstring(sent._.parse_string)

# Plot the tree
tree.pretty_print()

# Optionally, draw the tree using matplotlib
plt.figure(figsize=(12, 8))
tree.draw()

In [None]:
# Get the second sentence in the spacy document
sent = list(doc_spacy.sents)[1]

# Print the parse string to check its structure
print(sent._.parse_string)


In [20]:
#attributes = [attr for attr in dir(test_doc.sentences[0].entities) if not callable(getattr(test_doc.sentences[0].entities, attr))]
#print(attributes)
for entity in test_doc.sentences[0].entities:
    print(entity.text)
    print(entity.type)


the United States
GPE
five weeks
DATE
August
DATE
September
DATE


In [26]:
sentence = test_doc.sentences[0]

In [27]:
print(dir(sentence))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_comments', '_constituency', '_dependencies', '_doc', '_doc_id', '_empty_words', '_enhanced_dependencies', '_ents', '_index', '_process_tokens', '_sent_id', '_sentiment', '_text', '_tokens', '_words', 'add_comment', 'add_property', 'build_dependencies', 'build_ents', 'build_fake_dependencies', 'comments', 'constituency', 'dependencies', 'dependencies_string', 'doc', 'doc_id', 'empty_words', 'entities', 'ents', 'has_enhanced_dependencies', 'id', 'index', 'print_dependencies', 'print_tokens', 'print_words', 'rebuild_dependencies', 'sent_id', 'sentiment', 'text', 'to_dict', 'tokens', 'tokens_string', 'words', 'words_string']


In [28]:
word = sentence.words[0]
dir(word)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_coref_chains',
 '_deprel',
 '_end_char',
 '_feats',
 '_head',
 '_id',
 '_is_null',
 '_lemma',
 '_mexp',
 '_misc',
 '_parent',
 '_sent',
 '_start_char',
 '_text',
 '_upos',
 '_xpos',
 'add_property',
 'coref_chains',
 'deprel',
 'deps',
 'end_char',
 'feats',
 'head',
 'id',
 'lemma',
 'manual_expansion',
 'misc',
 'parent',
 'pos',
 'pretty_print',
 'sent',
 'start_char',
 'text',
 'to_conll_text',
 'to_dict',
 'upos',
 'xpos']

In [29]:
print(dir(word.lemma))

['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'capitalize', 'casefold', 'center', 'count', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'format_map', 'index', 'isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'isidentifier', 'islower', 'isnumeric', 'isprintable', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', 'removeprefix', 'removesuffix', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']


In [30]:
token = sentence.tokens[0]
print(dir(token))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_end_char', '_id', '_is_null', '_mexp', '_misc', '_multi_ner', '_ner', '_sent', '_spaces_after', '_spaces_before', '_start_char', '_text', '_words', 'add_property', 'consolidate_whitespace', 'end_char', 'id', 'is_mwt', 'manual_expansion', 'misc', 'multi_ner', 'ner', 'pretty_print', 'sent', 'spaces_after', 'spaces_before', 'start_char', 'text', 'to_conll_text', 'to_dict', 'words']


In [32]:
print(token.text)

print (token.words)

On
[{
  "id": 1,
  "text": "On",
  "lemma": "on",
  "upos": "SCONJ",
  "xpos": "IN",
  "head": 2,
  "deprel": "mark",
  "start_char": 0,
  "end_char": 2
}]


In [23]:
print(dir(test_doc.sentences[0].tokens[0]))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_end_char', '_id', '_is_null', '_mexp', '_misc', '_multi_ner', '_ner', '_sent', '_spaces_after', '_spaces_before', '_start_char', '_text', '_words', 'add_property', 'consolidate_whitespace', 'end_char', 'id', 'is_mwt', 'manual_expansion', 'misc', 'multi_ner', 'ner', 'pretty_print', 'sent', 'spaces_after', 'spaces_before', 'start_char', 'text', 'to_conll_text', 'to_dict', 'words']


In [36]:
for sentence in test_doc.sentences:
    for token in sentence.tokens:
        print(token.words[0].lemma, token.ner)

on O
see O
my O
person O
, O
he O
take O
the O
opportunity O
to O
inform O
I O
that O
he O
have O
just O
that O
moment O
finalize O
plan O
to O
return O
to O
the B-GPE
United I-GPE
State E-GPE
for O
a O
period O
of O
five B-DATE
week E-DATE
between O
August S-DATE
and O
September S-DATE
. O
' O
bring O
I O
the O
head O
of O
John B-PERSON
the I-PERSON
Baptist E-PERSON
, O
say O
King O
Charles S-PERSON
' O


In [None]:
ner_labels = set()
for sentence in doc_stanza.sentences:
    for word in sentence.words:
        ner_labels.add(word.ner)
print(len(ner_labels))
#print("NER Labels used:", ner_labels)

In [None]:
for sentence in doc.sentences[:10]:
    print(f"Sentence: {sentence.text}")
    for entity in sentence.ents:
        print(f"Entity: {entity.text}, Type: {entity.type}")

In [None]:
print(doc.sentences[0].words[0].ner)

In [None]:
len(lemmas)
print(lemmas[:100])

In [None]:
# Count the number of verbs in each sentence
verb_counts = []
for sentence in doc.sentences:
    verb_count = sum(1 for word in sentence.words if word.pos == 'VERB')
    verb_counts.append(verb_count)

# Plotting the histogram
plt.hist(verb_counts, bins=range(1, max(verb_counts) + 2), edgecolor='black', align='left')
plt.title('Histogram of the Number of Verbs per Sentence')
plt.xlabel('Number of Verbs')
plt.ylabel('Number of Sentences')
plt.xticks(range(1, max(verb_counts) + 1))
plt.show()


In [None]:


# Example: sorted_lemma_tuples = [('lemma1', count1), ('lemma2', count2), ...]
# Ensure `sorted_lemma_tuples` is sorted by the count in descending order.

# Extract lemmas and their counts
lemmas = [lemma for lemma, count in sorted_lemma_list]
counts = [count for lemma, count in sorted_lemma_list]

# Plotting the bar graph
plt.figure(figsize=(12, 8))  # Increase figure size if necessary
plt.bar(range(len(lemmas)), counts, color='lightcoral')

# Formatting the plot
plt.title('Lemma Frequency Distribution (Log Scale)')
plt.xlabel('Lemmas')
plt.ylabel('Counts (Log Scale)')
plt.yscale('log')  # Set y-axis to logarithmic scale

# Customizing the y-axis ticks
plt.gca().yaxis.set_major_locator(LogLocator(base=10.0, numticks=10))  # Set ticks at 10^0, 10^1, 10^2, etc.
plt.gca().yaxis.set_major_formatter(LogFormatter(base=10.0, labelOnlyBase=False))  # Use base-10 formatting

plt.xticks([])  # Remove x-axis labels
plt.tight_layout()  # Adjust layout to make room for x-axis labels

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Example: sorted_bigram_tuples = [(('lemma1', 'lemma2'), count1), (('lemma3', 'lemma4'), count2), ...]
# Make sure `sorted_bigram_tuples` is sorted by the count in descending order.

# Extract bigrams and their counts
bigrams = [' '.join(bigram) for bigram, count in sorted_bigram_tuples]
counts = [count for bigram, count in sorted_bigram_tuples]

# Plotting the bar graph
plt.figure(figsize=(10, 6))
plt.bar(bigrams, counts, color='skyblue')

# Formatting the plot
plt.title('Bigram Frequency Distribution')
plt.xlabel('Bigrams')
plt.ylabel('Counts')
plt.xticks(rotation=90)  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to make room for x-axis labels

plt.show()


In [None]:
# Tokenize text with nltk
test_text = "The quick brown fox jumps over the lazy dog."
tokens = word_tokenize(test_text)

# Analyze frequency
fdist = FreqDist(tokens)
print(fdist.most_common())


Textstat:
Readability Metrics: Calculates readability scores such as Flesch-Kincaid, Gunning Fog Index, and others.

Lexical Richness: Measures vocabulary diversity and richness (e.g., Type-Token Ratio, Hapax Legomena).


jupyter labextension install @jupyterlab/statusbar
Once installed, you should see a status bar at the bottom of JupyterLab showing memory usage, CPU load, and kernel status.

In [None]:
# NLTK with Custom Regex Tokenizer

# NLTK allows for more fine-grained control over tokenization using regular expressions. You can define custom patterns to handle specific cases.
# Define a custom tokenizer with regex for contractions and punctuation

tokenizer = RegexpTokenizer(r"[\w]+|[^\w\s]", gaps=False)

# Tokenize a sample sentence
test_text = "Don't stop—keep going! It's fascinating... isn't it?"
tokens = tokenizer.tokenize(test_text)
print(tokens)


SpaCy Custom Tokenizer

SpaCy allows you to modify its default tokenizer or add custom rules to handle specific cases like contractions or punctuation.

In [None]:
# Customize the tokenizer
infixes = nlp_spacy.Defaults.infixes + [r'--', r'\.\.\.']  # Add em dashes and ellipses as infixes
infix_re = compile_infix_regex(infixes)
nlp_spacy.tokenizer.infix_finditer = infix_re.finditer

# Add custom rules for contractions
special_cases = {"don't": [{"ORTH": "do"}, {"ORTH": "n't"}],
                 "it's": [{"ORTH": "it"}, {"ORTH": "'s"}]}
for case, rules in special_cases.items():
    nlp_spacy.tokenizer.add_special_case(case, rules)

# Tokenize a sample sentence
test_text = "Don't stop—keep going! It's fascinating... isn't it?"
doc = nlp_spacy(test_text)
print([token.text for token in doc])


In [None]:
for sentence in doc.sentences[:30]:
    print (sentence.sentiment)


In [None]:
string = " ".join([word.text for word in doc.sentences[5].words])
print(string)
plot_sentence_graph(doc.sentences[5])

In [None]:
test_text = "The quick brown foxes are jumping over the lazy dogs."
test_doc = nlp_spacy(test_text)

# Tokenization and Lemmatization
for token in test_doc:
    print(f"Token: {token.text}, Lemma: {token.lemma_}")


Post-Processing Tokens

After tokenization, you might want to refine the tokens to ensure accuracy. This can involve:

Merging Tokens: Combining tokens that were incorrectly split (e.g., splitting "don't" into "do" and "n't").

Removing or Replacing Tokens: Removing unnecessary punctuation or normalizing specific characters.

In [None]:
# Process text with spacy
test_text = "Don't stop—keep going!"
test_doc = nlp_spacy(test_text)

# Manually merge tokens if needed
with test_doc.retokenize() as retokenizer:
    for i, token in enumerate(doc):
        if token.text == "n't":
            retokenizer.merge(test_doc[i-1:i+1])

print([token.text for token in test_doc])


Examine features

In [None]:
sentence_number = 0
sentence = doc.sentences[sentence_number]
#sentence = tranche_docs[0][1].sentences[sentence_number]

for word in sentence.words:
    print(f"word:  {word.text}  features:  {word.feats}\n")
print(f"sentence sentiment: {sentence.sentiment}")

In [None]:
lines = book_text.splitlines()
for i, line in enumerate(lines[:10]):
    print(f'line {i}:{line}')

In [None]:
for sentence in doc.sentences[:2]:
   for word_ref, word in enumerate(sentence.words):
       print(f"{word_ref+1}  Word: {word.text}, Lemma: {word.lemma}, POS: {word.pos}, Head: {word.head}, DepRel: {word.deprel}")
   print("\n")
   

In [None]:
max_sentences = 3
for i, sentence in enumerate(doc.sentences[:max_sentences]):
    print(f"Sentence {i + 1}: {sentence.text}")
    for word in sentence.words:
        print(f"  Word: {word.text}, POS: {word.pos}, Lemma: {word.lemma}")
    print()  # Print a blank line between sentences

Saving and loading stanza objects

In [None]:
save_name = "Never let me go stanza json file with invalid chars"
file_path = directory_path + save_name + ".json"
with open(file_path, 'w', encoding='utf-8') as file:
    # Write the opening of the JSON array
    file.write('[\n')
    
    # Process each sentence and write it as a separate JSON object
    for i, sentence in enumerate(doc.sentences):
        sentence_data = {
            "text": sentence.text,
            "words": [{"text": word.text, "pos": word.pos, "lemma": word.lemma} for word in sentence.words]
            }
        
        # Write the JSON object to the file
        json.dump(sentence_data, file, ensure_ascii=False, indent=4)
        
        # If it's not the last sentence, add a comma
        if i < len(doc.sentences) - 1:
            file.write(',\n')
    
    # Write the closing of the JSON array
    file.write('\n]')

In [None]:
# Load the JSON file
save_name = "Never let me go stanza json file with invalid chars"
file_path = directory_path + save_name + ".json"
with open(file_path, 'r', encoding='utf-8') as file:
    sentences_data = json.load(file)

In [None]:
# Manually create a document-like structure from the loaded data
# This is a simplification and not an exact replica of stanza.Document
class SimpleDocument:
    def __init__(self, sentences):
        self.sentences = sentences

class SimpleSentence:
    def __init__(self, text, words):
        self.text = text
        self.words = words

class SimpleWord:
    def __init__(self, text, pos, lemma):
        self.text = text
        self.pos = pos
        self.lemma = lemma

# Reconstruct the sentences
reconstructed_sentences = []
for sentence_data in sentences_data:
    words = [SimpleWord(word['text'], word['pos'], word['lemma']) for word in sentence_data['words']]
    sentence = SimpleSentence(sentence_data['text'], words)
    reconstructed_sentences.append(sentence)

# Create a simple document object
reconstructed_doc = SimpleDocument(reconstructed_sentences)

# Access the reconstructed data
for sentence in reconstructed_doc.sentences:
    print(f"Sentence: {sentence.text}")
    for word in sentence.words:
        print(f"  Word: {word.text}, POS: {word.pos}, Lemma: {word.lemma}")


In [None]:
import csv

In [None]:
save_name = "Never let me go stanza csv file with invalid chars"
file_path = directory_path + save_name + ".csv"
# Save annotations to a CSV file
with open(file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Sentence", "Word", "POS", "Lemma"])  # Header
    for sentence in doc.sentences:
        for word in sentence.words:
            writer.writerow([sentence.text, word.text, word.pos, word.lemma])

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path, encoding='utf-8')





In [None]:
print(df.head())


In [None]:
print(df.describe())

In [None]:
print(df[['Word', 'POS']].head())

In [None]:
# Example: Filtering rows where a certain condition is met
filtered_df = df[df['POS'] == 'NOUN']

In [None]:
# Example: Grouping by a column and counting occurrences
grouped_df = df.groupby('POS').count()

In [None]:
# Example: Sorting by a specific column
sorted_df = df.sort_values(by='Lemma')
print(sorted_df.head())

In [None]:
# save modified data frame
df.to_csv('modified_stanza_output.csv', index=False, encoding='utf-8')

In [None]:
save_name = "Never let me go stanza text file with invalid chars"
file_path = directory_path + save_name + ".txt"
with open(file_path, 'w', encoding='utf-8') as file:
    for sentence in doc.sentences:
        file.write(f"Sentence: {sentence.text}\n")
        for word in sentence.words:
            file.write(f"  Word: {word.text}, POS: {word.pos}, Lemma: {word.lemma}\n")
        file.write("\n")  # Blank line between sentences