In [16]:
import os
import re
import string
from itertools import combinations
import nltk
from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.importer import CorpusImporter
from nltk.util import ngrams
from glob import glob
import csv
import pandas as pd


# Import the 'latin_models_cltk' corpus through CLTK's corpus importer.
# NOTE(review): presumably this fetches the model data that LemmaReplacer
# relies on -- confirm against the CLTK documentation.
corpus_importer = CorpusImporter('latin')
corpus_importer.import_corpus('latin_models_cltk')

# Latin lemmatizer; used further down to lemmatize every line before the
# bi-grams are built.
lemmatizer = LemmaReplacer('latin')

# function to compute the Jaccard similarity of two sets:
# the number of observations present in both sets (the intersection)
# divided by the number of observations present in either set (the union)


def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two collections, rounded to 2 digits.

    The score is the size of the intersection divided by the size of the
    union. Arguments that are not already ``set`` instances are converted
    with ``set()`` first, so any iterable of hashable items is accepted.

    Returns:
        A float between 0.0 and 1.0; 0.0 when both collections are empty
        (which also sidesteps the division by zero).
    """
    left = set1 if isinstance(set1, set) else set(set1)
    right = set2 if isinstance(set2, set) else set(set2)

    union_size = len(left | right)
    if union_size == 0:
        # Nothing to compare: report "no similarity" rather than dividing by 0.
        return 0.0

    shared_size = len(left & right)
    # True division followed by round() always yields a float (0.0, not 0).
    return round(shared_size / union_size, 2)


# Glob pattern matching the Senecan play files of the corpus.
dir_path = "../../corpus_imposters/sen_*.txt"

# File holding the text under investigation (Octavia); every other loaded play
# is compared against it.
OCTAVIA_FILE = "sen_oct.txt"
# Files matched by the pattern that are left out of the comparison altogether.
EXCLUDED_FILES = {"sen_her_o.txt"}
# A pair of lines is reported only when its Jaccard score exceeds this value.
SIMILARITY_THRESHOLD = 0.3

# Compiled once: matches every character that is neither a word character nor
# whitespace, i.e. the punctuation stripped from each line.
_PUNCTUATION = re.compile(r'[^\w\s]')


def _lemma_bigrams(line):
    """Return the set of bi-grams built from the lemmatized form of ``line``."""
    return set(ngrams(lemmatizer.lemmatize(line), 2))


# Map each play's file name to its list of lower-cased, punctuation-free lines.
senecan_plays = {}

# sorted() makes the processing order -- and therefore the row order of the
# output -- independent of the order in which the file system lists the files.
for filename in sorted(glob(dir_path)):
    play_file = os.path.basename(filename)
    if play_file in EXCLUDED_FILES:
        continue
    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, "r", encoding="utf-8") as inp:
        senecan_plays[play_file] = [
            _PUNCTUATION.sub('', line.lower())
            for line in inp.read().splitlines()
        ]

# Fail with a readable message instead of a bare KeyError when the pattern did
# not pick up the Octavia file (wrong working directory, missing corpus, ...).
if OCTAVIA_FILE not in senecan_plays:
    raise FileNotFoundError(
        "{} was not found among the files matching {!r}".format(
            OCTAVIA_FILE, dir_path))

# extract the lines of the cento poem
oct_lines = senecan_plays[OCTAVIA_FILE]

# Lemmatize every line of the other plays exactly once. Doing this inside the
# comparison loop would repeat the (expensive) lemmatization of each play line
# for every single Octavia line.
play_bigrams = {
    play_name: [_lemma_bigrams(line) for line in play_lines]
    for play_name, play_lines in senecan_plays.items()
    if play_name != OCTAVIA_FILE
}

# Collected matches as tuples of
# (Octavia line number, Octavia line, play file, play line number, play line,
#  similarity); line numbers are 1-based.
similar_lines = []

for i, oct_line in enumerate(oct_lines, start=1):
    octavia_set = _lemma_bigrams(oct_line)
    if not octavia_set:
        # A line with fewer than two tokens has no bi-grams; its similarity to
        # anything is 0.0 and can never exceed the threshold.
        continue
    for play_name, bigram_sets in play_bigrams.items():
        play_lines = senecan_plays[play_name]
        for n, (play_line, play_set) in enumerate(
                zip(play_lines, bigram_sets), start=1):
            similarity = jaccard_similarity(octavia_set, play_set)
            if similarity > SIMILARITY_THRESHOLD:
                similar_lines.append(
                    (i, oct_line, play_name, n, play_line, similarity))

# Write the similar lines to a CSV file. UTF-8 is stated explicitly because
# pandas reads the file back as UTF-8 by default.
with open("similar_lines.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Poem", "Octavia Line Number", "Octavia Line", "Similar Poem",
                     "Similar Poem Line Number", "Similar Poem Line", "Similarity"])
    writer.writerows(("Octavia",) + line_info for line_info in similar_lines)


# Reload the results as a DataFrame with shorter column names; the bare
# expression on the last line displays the table in the notebook.
similar_lines = pd.read_csv("similar_lines.csv", encoding="utf-8")
similar_lines.columns = ["cento", "line_num_cento", "line_cento",
                         "play", "line_num_play", "line_play", "jaccard_simil_score"]
similar_lines

Unnamed: 0,cento,line_num_cento,line_cento,play,line_num_play,line_play,jaccard_simil_score
0,Octavia,58,fortuna licet,sen_thy.txt,983,redeat felix fortuna licet,0.33
1,Octavia,61,flere parentem,sen_tro.txt,811,rumpe iam fletus parens,0.33
2,Octavia,242,regit bootes frigore arctoo rigens,sen_med.txt,714,taurus cohercet frigore arctoo rigens,0.33
3,Octavia,309,dira libido,sen_phaed.txt,1006,vincit sanctos dira libido,0.33
4,Octavia,383,hic est hic est fodiendus ait,sen_tro.txt,731,hic est hic est terror vlixe,0.33
5,Octavia,760,tandem quietem magnus excussit timor,sen_oed.txt,248,curam perempti maior excussit timor,0.33
