In [2]:
%pip install lxml


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
from lxml import etree
from pathlib import Path

In [4]:
files = list(Path("tlg0012").glob("./**/*perseus-eng*.xml"))

for f in files:
    print(f)

tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml
tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml
tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml
tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml
tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml


In [5]:
TEI_NS = "http://www.tei-c.org/ns/1.0"
XML_NS = "http://www.w3.org/XM:/1998/namespace"

NAMESPACES = {
    "tei": TEI_NS,
    "xml": XML_NS,
}

In [6]:
for file in files:
    # print the name of the file as a sanity check
    print(file)
    
    # etree.parse() reads the file and turns the raw XML into an object that we can use in Python
    tree = etree.parse(file)

    # xpath() is a method that applies **xpath expressions** to search through the XML.
    # This xpath expression says, "Find any `tei:div` element with a `subtype` of `'card'`.
    # Under that element, get any text." The second argument, `namespaces=`, tells the
    # method to use the supplied namespaces as shortcuts, so we don't have to type out
    # "http://www.tei-c.org/ns/1.0" every time we want an element in the TEI namespace.

    text = tree.xpath(f"//tei:div[@subtype='card']//text()", namespaces=NAMESPACES)

    # xpath() returns an array of matches, so we initialize an empty array to store the
    # results. We could use a list comprehension, but for now rewriting these
    # lines as a list comprehension is left as an exercise for the reader.
    cleaned_text = []

    # Now we iterate through each string returned by `xpath()`
    for t in text:
        # `strip()` removes leading and trailing whitespace; if all that's left is an empty
        # string, we don't care about it.
        if t.strip() != "":
            cleaned_text.append(t.strip())

    # We make sure that we actually *have* text before writing just the text, without
    # TEI elements, to a separate file. No need to write an empty file, right?
    if len(cleaned_text) > 0:
        # A lot is happening here:
        #
        # 1. `str(file)` turns the `Path` object into a `str`
        # 2. `split("/")` splits the resulting string at every "/"
        # 3. `[-1]` takes the last element of the list returned by `split("/")`
        # 4. `replace(".xml", ".txt")` changes the extension of the file
        # 
        # So something like "xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml"
        # is transformed into "tlg0012.tlg001.perseus-eng3.txt".
        with open(str(file).split("/")[-1].replace(".xml", ".txt"), "w+") as f:
            # We write the text to the file, `join`-ing each element in
            # `cleaned_text` with a newline ("\n")
            f.write("\n".join(cleaned_text))
    

tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml


tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml
tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml
tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml
tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml


In [7]:
%pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m796.9/796.9 kB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading click-8.1.8-py3-none-any.whl (98 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk
[0mSuccessfully installed click-8.1.8 nltk-3.9.1 re

In [8]:
import nltk

# download the files needed for tokenization
# the punkt tokenizer should be installed already,
# but let's download it just in case
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
# Initialize the tokenizer
from nltk.tokenize import word_tokenize

# Initialize an empty dictionary to store the tokenized texts
tokenized_texts = {}

# Get a Path.glob() iterator for the .txt files that you've created in this directory.
# Can you figure out what the new `[1-4]` segment is doing?
text_files = Path(".").glob("tlg0012.tlg00*.perseus-eng[1-4].txt")

# Iterate through the text files, reading and tokenizing them one by one,
# then storing the list of tokens in our `tokenized_texts` dictionary —
# so we'll be getting a dictionary of lists.
for file in text_files:
    name = str(file)

    with open(file) as f:
        # Notice we're lowercasing the text. You don't *have*
        # to do this, but it helps eliminate some noise for
        # our purposes.
        text = f.read().lower()
        tokens = word_tokenize(text)

        # Let's just print the length of the tokens list to make
        # sure we're getting sane results. We'll use string interpolation
        # to identify which text we're working with.
        print(f"There are {len(tokens)} tokens in {name}.")

        # Store each file's `tokens` list in the `tokenized_texts`
        # dictionary, using the filename as the key.
        tokenized_texts[name] = tokens


There are 0 tokens in tlg0012.tlg003.perseus-eng1.txt.
There are 200630 tokens in tlg0012.tlg001.perseus-eng3.txt.
There are 135463 tokens in tlg0012.tlg002.perseus-eng4.txt.
There are 175611 tokens in tlg0012.tlg001.perseus-eng4.txt.
There are 152631 tokens in tlg0012.tlg002.perseus-eng3.txt.


In [10]:
from collections import Counter

# Using our `tokenized_texts` dictionary, we'll iterate
# through each key-value pair — remember, the keys are
# filenames and the values are lists of tokens.
# We'll get a count of the tokens by passing the list to
# `Counter`, then we'll change the value for that key to
# a dictionary with its own keys, `tokens` and `counts`.

for filename, tokens in tokenized_texts.items():
    counts = Counter(tokens)

    tokenized_texts[filename] = {"tokens": tokens, "counts": counts}

In [11]:
tokenized_texts["tlg0012.tlg001.perseus-eng3.txt"]["counts"]["odysseus"]

128

In [12]:
df_achilles = 0
df_odysseus = 0

# Calculate the DF for "odysseus" and "achilles".
# We iterate through the dictionary, and then simply
# count the number of files in which we find each term.
# For these two terms, we should probably expect DFs of 4.
for filename, values in tokenized_texts.items():
    if "odysseus" in values['counts']:
        df_odysseus += 1
    
    if "achilles" in values["counts"]:
        df_achilles += 1

# Now we'll import the log function to calculate the IDF for each term.
from math import log10

n_docs = len(tokenized_texts.keys())

idf_achilles = log10(n_docs / df_achilles)
idf_odysseus = log10(n_docs / df_odysseus)

print(idf_achilles)

0.09691001300805642


In [13]:
# Now let's calculate the TF-IDF "score" for each term in each document.

# Once again, iterate through the dictionary.
for filename, values in tokenized_texts.items():
    # Get the total number of terms in each file — we'll
    # use this to calculate the relative frequency as our
    # TF.
    total_terms = len(values['tokens'])

    # Get the TF for each term in this file.
    tf_achilles = values['counts']['achilles'] / total_terms
    tf_odysseus = values['counts']['odysseus'] / total_terms

    # Remember, the simplest version of TF-IDF is just
    # TF * 1/DF
    tf_idf_achilles = tf_achilles * idf_achilles
    tf_idf_odysseus = tf_odysseus * idf_odysseus

    # Now we can report on the statistics for this file
    print(f"""In {filename}:
TF of achilles: {tf_achilles}
TF of odysseus: {tf_odysseus}
TF-IDF of achilles: {tf_idf_achilles}
TF-IDF of odysseus: {tf_idf_odysseus}
""")

ZeroDivisionError: division by zero

What happened to cause the error?

The logarithmic function hiding under this division is what is causing the denominator of the division function to be 0. Since throughout all documents these names will appear, you cannot use it as a document and term frequency function part. 

To improve this analysis, I would chose a less obviously frequent term to count.

In [14]:
my_list = [1, 1, 2, 3, 3]

set(my_list)

{1, 2, 3}

In [15]:
non_universal_terms = {}

for filename, values in tokenized_texts.items():
    my_set = set(values['counts'].keys())

    for other_file, other_values in tokenized_texts.items():
        # make sure we don't compare the file
        # to itself, otherwise the difference
        # will be the empty set
        if other_file != filename:
            my_set -= set(other_values['counts'].keys())
    
    # now push the remaining set of terms to the dictionary
    non_universal_terms[filename] = my_set

# log `non_universal_terms` as a sanity check
non_universal_terms

{'tlg0012.tlg003.perseus-eng1.txt': set(),
 'tlg0012.tlg001.perseus-eng3.txt': {'pigmy',
  'wide-flowing',
  'daetor',
  'loud-voiced',
  'minishing',
  'thaleia',
  'placus',
  'peace—yet',
  'feareth',
  'plain—even',
  '—himself',
  'divisions—in',
  'gazeth',
  '215.1',
  'vexeth',
  'speweth',
  'vapour',
  'sore-pressed',
  '565.2',
  'ialmenus',
  'archeptolemus',
  'guideth',
  '121.1',
  'swine-herd',
  'quaffest',
  'fierce-rushing',
  'over-weening',
  'sinketh',
  'fluttered',
  'long-edged',
  'exultest',
  'battlegear',
  'enwrapping',
  'consolation',
  'favouring',
  'accompany',
  "whoe'er",
  'daysman',
  'them—even',
  'sore-wearied',
  'frankness',
  'honour',
  'vineland',
  'swift-glancing',
  'bronze-harnessed',
  'division',
  'mountainside',
  '181.1',
  'term',
  'waging',
  'lysander',
  '273.1',
  'archelochus',
  'eyeballs',
  'ungirt',
  'rending',
  'dancing-floor',
  'ascalaphus',
  'swallows',
  'coveteth',
  'shield-men',
  'trailed',
  '—of',
  'bronz

HW Section

Selected term to explore:
wise



In [16]:
adjective = {"wise"}

for filename, values in tokenized_texts.items():
    my_set1 = set(values['counts'].keys())

    for other_file, other_values in tokenized_texts.items():
        # make sure we don't compare the file
        # to itself, otherwise the difference
        # will be the empty set
        if other_file != filename:
            my_set1 -= set(other_values['counts'].keys())
    
    # now push the remaining set of terms to the dictionary
    adjective[filename] = my_set1

# log `adjective` as a sanity check
adjective

TypeError: 'set' object does not support item assignment

Potential error: the item being assigned to that set was not previously a part of that defined set?

How I might improve the analysis in the future:
- Work towards chosing a set of items that best suit my entire dataset. I briefly skimmed one of the texts and chose "wise" but it may be that this would cause even more errors in the future if it's way too common. 

What TF-IDF is really telling us about a given term in each text and within the corpus as a whole:
In each text, TF-IDF helps us pinpoint a specific term's counted frequency in relationship to the entire document's usage of the term. Within the corpus of the whole, the term is weighted against its overall frequency as a helpful tool bypassing word order in ways that may or may not be fully conducive to your research question. 

Other objects of study for which TF-IDF might be useful:
TF-IDF does not consider word order so this could be helpful for a massive dataset of terms not needing analysis related to their related collocates. Also, training chatbots could use this method to quickly collect and analyze data. 