In [None]:
%%time

import os
import csv
from cltk.data.fetch import FetchCorpus
from cltk.sentence.lat import LatinPunktSentenceTokenizer 
from Levenshtein import distance as levenshtein_distance
# from cltk.tokenizers import LatinWordTokenizer
from cltk.corpora.lat.phi import file_utils

# Fetch the CLTK Latin models corpus ("lat_models_cltk").
# NOTE(review): presumably required by the sentence tokenizer created below — confirm.
corpus_downloader = FetchCorpus(language="lat")
corpus_downloader.import_corpus("lat_models_cltk")

# Shared CLTK Latin sentence tokenizer; tokenize_sentences() delegates to it.
sentence_tokenizer = LatinPunktSentenceTokenizer(strict=True)

# Directory containing the corpus text files (path is relative to the working directory).
directory_path = '../../corpora/corpus_imposters/'

# Directory the result CSV files are written to; created here if it does not already exist.
results_directory = os.path.join('..', 'lines-similarity', 'results_line_sim')
os.makedirs(results_directory, exist_ok=True)

# File names of the disputed plays (O, HO). They are left out of play_sentences
# and are instead compared against every play stored there.
disputed_texts = ['sen_oct.txt', 'sen_her_o.txt']

# Maps a play name (file name without extension) to that play's tokenized sentences.
play_sentences = {}

def read_file(file_path):
    """Return the complete contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's text as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def tokenize_sentences(text):
    """Split *text* into sentences with the module-level ``sentence_tokenizer``.

    Args:
        text: The raw text to split.

    Returns:
        Whatever ``sentence_tokenizer.tokenize`` returns for *text*.
    """
    sentences = sentence_tokenizer.tokenize(text)
    return sentences

def compare_sentences(play1, play2, threshold=.6):
    """Find every pair of sentences whose similarity reaches *threshold*.

    Each sentence of *play1* is scored against each sentence of *play2* with
    ``levenshtein_similarity``.

    Args:
        play1: Sentences of the first play.
        play2: Sentences of the second play.
        threshold: Minimum similarity (inclusive) for a pair to be kept.

    Returns:
        A list of ``(i, j, similarity, sentence1, sentence2)`` tuples, where
        ``i`` and ``j`` are the sentence positions in *play1* and *play2*.
        Pairs appear in iteration order (by ``i``, then by ``j``).
    """
    # Lazily score every (play1 sentence, play2 sentence) combination.
    scored_pairs = (
        (idx1, idx2, levenshtein_similarity(line1, line2), line1, line2)
        for idx1, line1 in enumerate(play1)
        for idx2, line2 in enumerate(play2)
    )
    # Keep only the combinations that are similar enough.
    return [pair for pair in scored_pairs if pair[2] >= threshold]

def levenshtein_similarity(str1, str2):
    """Return a length-normalized Levenshtein similarity of two strings.

    The edit distance between the strings is divided by the length of the
    longer one and subtracted from 1, so identical strings score 1.0 and the
    score decreases as more edits are needed.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The similarity score; 1.0 when both strings are empty.
    """
    longest = max(len(str1), len(str2))

    # Two empty strings are identical. Without this guard the normalization
    # below would divide by zero.
    if longest == 0:
        return 1.0

    return 1 - (levenshtein_distance(str1, str2) / longest)

# Tokenize every undisputed Senecan play in the corpus directory and store
# its sentences in play_sentences.
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)

    # Skip anything that is not a Senecan play, as well as the disputed plays.
    if not filename.startswith("sen_") or filename in disputed_texts:
        continue

    # Use the file name without its extension as a readable dictionary key.
    clean_filename = os.path.splitext(os.path.basename(filename))[0]

    # Read the play and split it into sentences.
    text = read_file(file_path)
    sentences = tokenize_sentences(text)

    play_sentences[clean_filename] = sentences

# Compare each disputed play with every play stored in play_sentences and
# write one CSV of similar sentence pairs per (disputed play, Senecan play)
# combination.
for disputed_text in disputed_texts:
    # Strip the extension so the output file names are more readable.
    disputed_text_name = os.path.splitext(os.path.basename(disputed_text))[0]
    disputed_path = os.path.join(directory_path, disputed_text)

    # Read and tokenize the disputed play the same way as the reference plays.
    disputed_text_content = read_file(disputed_path)
    disputed_sentences = tokenize_sentences(disputed_text_content)

    for senecan_play, senecan_sentences in play_sentences.items():
        similar_lines = compare_sentences(disputed_sentences, senecan_sentences)

        # The keys of play_sentences already have their extension removed, so
        # they are used as-is. Stripping an "extension" a second time would
        # truncate names that contain a dot and could make two different plays
        # share (and overwrite) the same result file.
        csv_filename = os.path.join(
            results_directory,
            f'similarity_{disputed_text_name}_vs_{senecan_play}_res.csv',
        )

        # newline='' lets the csv module control line endings itself.
        with open(csv_filename, 'w', encoding='utf-8', newline='') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(['Disputed Sentence Index', 'Senecan Sentence Index', 'Similarity', 'Disputed Sentence', 'Senecan Sentence'])
            # Each entry is (i, j, similarity, disputed sentence, Senecan
            # sentence), which already matches the header's column order.
            csv_writer.writerows(similar_lines)