# Cosine Similarity (tf-idf vectorizer)

In [1]:
# %%time

# import os
# import csv
# from sklearn.feature_extraction.text import TfidfVectorizer
# from cltk.data.fetch import FetchCorpus
# from cltk.sentence.lat import LatinPunktSentenceTokenizer
# from sklearn.metrics.pairwise import cosine_similarity

# # import the Latin model for the sentence tokenizer
# corpus_downloader = FetchCorpus(language='lat')
# corpus_downloader.import_corpus('lat_models_cltk')
# corpus_downloader.import_corpus('latin_training_set_sentence_cltk')

# # initialize CLTK sentence tokenizer for Latin
# sentence_tokenizer = LatinPunktSentenceTokenizer(strict=True)

# # corpus of texts
# directory_path = '../../corpora/corpus_imposters/'

# # directory to write the results if it does not exist
# results_directory = os.path.join('..', 'lines-similarity', 'results_line_sim_cosine')
# os.makedirs(results_directory, exist_ok=True)

# # disputed texts (O, HO)
# disputed_texts = ['sen_oct.txt', 'sen_her_o.txt']

# # dictionary to save sentences for each play
# play_sentences = {}

# # function to read files
# def read_file(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         return file.read()

# # function to tokenize sentences
# def tokenize_sentences(text):
#     return sentence_tokenizer.tokenize(text)

# # function to compare sentences using cosine similarity
# def compare_sentences(play1, play2, threshold=0.7):
#     # convert sentences to vectors using TfidfVectorizer
#     vectorizer = TfidfVectorizer()
#     vectors = vectorizer.fit_transform(play1 + play2)
    
#     # compute cosine similarity
#     similarities = cosine_similarity(vectors[:len(play1)], vectors[len(play1):])

#     # find similar sentences
#     similar_lines = []
#     for i in range(len(play1)):
#         for j in range(len(play2)):
#             if similarities[i, j] >= threshold:
#                 similar_lines.append((i, j, similarities[i, j], play1[i], play2[j]))

#     return similar_lines

# # process each file in the directory
# for filename in os.listdir(directory_path):
#     file_path = os.path.join(directory_path, filename)

#     # check if the file is a Senecan play and not the disputed
#     if filename.startswith("sen_") and filename not in disputed_texts:
#         # clean filename to make the result more readable
#         clean_filename = os.path.splitext(os.path.basename(filename))[0]
#         # read the file
#         text = read_file(file_path)
#         # split into sentences
#         sentences = tokenize_sentences(text)
#         # save the results into the dictionary
#         play_sentences[clean_filename] = sentences

# # process disputed plays (follow the same steps as for the originals)
# for disputed_text in disputed_texts:
#     # clean filename to make results more readable
#     disputed_text_name = os.path.splitext(os.path.basename(disputed_text))[0]
#     disputed_path = os.path.join(directory_path, disputed_text)
#     # read the contents of the file for each disputed text
#     disputed_text_content = read_file(disputed_path)
#     # split into sentences (for the disputed texts)
#     disputed_sentences = tokenize_sentences(disputed_text_content)

#     # compare sentences with each Senecan play using cosine similarity
#     for senecan_play, senecan_sentences in play_sentences.items():
#         similar_lines = compare_sentences(disputed_sentences, senecan_sentences)

#         # save results to CSV with a unique name for each combination
#         clean_senecan_play = os.path.splitext(os.path.basename(senecan_play))[0]
#         csv_filename = os.path.join(results_directory, f'similarity_{disputed_text_name}_vs_{clean_senecan_play}_res.csv')
#         with open(csv_filename, 'w', encoding='utf-8', newline='') as csvfile:
#             csv_writer = csv.writer(csvfile)
#             csv_writer.writerow(['Disputed Sentence Index', 'Senecan Sentence Index', 'Similarity', 'Disputed Sentence', 'Senecan Sentence'])

#             for i, j, similarity, disputed_sentence, senecan_sentence in similar_lines:
#                 csv_writer.writerow([i, j, similarity, disputed_sentence, senecan_sentence])
                
#         # remove similar sentences from disputed text
#         for i, _, _, disputed_sentence, _ in similar_lines:
#             disputed_sentences[i] = ""  # replace similar sentences with an empty string

#     # write modified disputed text to a new file
#     modified_disputed_path = os.path.join(results_directory, f'modified_{disputed_text_name}.txt')
#     with open(modified_disputed_path, 'w', encoding='utf-8') as modified_file:
#         modified_file.write('\n'.join([sentence for sentence in disputed_sentences if sentence]))

#     print(f"Modified disputed text saved to: {modified_disputed_path}")

Modified disputed text saved to: ../lines-similarity/results_line_sim_cosine/modified_sen_oct.txt
Modified disputed text saved to: ../lines-similarity/results_line_sim_cosine/modified_sen_her_o.txt
CPU times: user 3.87 s, sys: 815 ms, total: 4.69 s
Wall time: 11.9 s


In [2]:
%%time

import os
import csv
import random
from cltk.data.fetch import FetchCorpus
from cltk.sentence.lat import LatinPunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Import the Latin model for the sentence tokenizer
corpus_downloader = FetchCorpus(language='lat')
corpus_downloader.import_corpus('lat_models_cltk')
corpus_downloader.import_corpus('latin_training_set_sentence_cltk')

# Initialize CLTK sentence tokenizer for Latin
sentence_tokenizer = LatinPunktSentenceTokenizer(strict=True)

# Corpus of texts
directory_path = '../../corpora/corpus_imposters/'

# Directory to write the results if it does not exist
results_directory = os.path.join('..', 'lines-similarity', 'results_line_sim_cosine')
os.makedirs(results_directory, exist_ok=True)

# Disputed texts (O, HO)
disputed_texts = ['sen_oct.txt', 'sen_her_o.txt']

# Dictionary to save sentences for each play
play_sentences = {}

# Dictionary to store the count of removed sentences for each disputed play
removed_sentence_count = {}

# Function to read files
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Function to tokenize sentences
def tokenize_sentences(text):
    return sentence_tokenizer.tokenize(text)

# Function to compare sentences using cosine similarity
def compare_sentences(play1, play2, threshold=.6):
    # Convert sentences to vectors using TfidfVectorizer
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(play1 + play2)
    
    # Compute cosine similarity
    similarities = cosine_similarity(vectors[:len(play1)], vectors[len(play1):])

    # Find similar sentences
    similar_lines = []
    for i in range(len(play1)):
        for j in range(len(play2)):
            if similarities[i, j] >= threshold:
                similar_lines.append((i, j, similarities[i, j], play1[i], play2[j]))

    return similar_lines

# Process each file in the directory
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)

    # Check if the file is a Senecan play and not the disputed
    if filename.startswith("sen_") and filename not in disputed_texts:
        # Clean filename to make the result more readable
        clean_filename = os.path.splitext(os.path.basename(filename))[0]
        # Read the file
        text = read_file(file_path)
        # Split into sentences
        sentences = tokenize_sentences(text)
        # Save the results into the dictionary
        play_sentences[clean_filename] = sentences

# Process disputed plays (follow the same steps as for the originals)
for disputed_text in disputed_texts:
    # Clean filename to make results more readable
    disputed_text_name = os.path.splitext(os.path.basename(disputed_text))[0]
    disputed_path = os.path.join(directory_path, disputed_text)
    # Read the contents of the file for each disputed text
    disputed_text_content = read_file(disputed_path)
    # Split into sentences (for the disputed texts)
    disputed_sentences = tokenize_sentences(disputed_text_content)

    # Compare sentences with each Senecan play using cosine similarity
    for senecan_play, senecan_sentences in play_sentences.items():
        similar_lines = compare_sentences(disputed_sentences, senecan_sentences)

        # Save results to CSV with a unique name for each combination
        clean_senecan_play = os.path.splitext(os.path.basename(senecan_play))[0]
        csv_filename = os.path.join(results_directory, f'similarity_{disputed_text_name}_vs_{clean_senecan_play}_res.csv')
        with open(csv_filename, 'w', encoding='utf-8', newline='') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(['Disputed Sentence Index', 'Senecan Sentence Index', 'Similarity', 'Disputed Sentence', 'Senecan Sentence'])

            for i, j, similarity, disputed_sentence, senecan_sentence in similar_lines:
                csv_writer.writerow([i, j, similarity, disputed_sentence, senecan_sentence])

        # Print 5 random examples along with their similarity threshold
        print(f"Examples for {disputed_text_name} vs {clean_senecan_play}:")
        random_samples = random.sample(similar_lines, min(5, len(similar_lines)))
        for i, j, similarity, disputed_sentence, senecan_sentence in random_samples:
            print(f"Similarity: {similarity:.4f}")
            print(f"Disputed Sentence: {disputed_sentence}")
            print(f"Senecan Sentence: {senecan_sentence}")

        # Count and store the number of removed sentences
        removed_sentence_count[(disputed_text_name, clean_senecan_play)] = len(similar_lines)

        # Remove similar sentences from disputed text
        for i, _, _, _, _ in similar_lines:
            disputed_sentences[i] = ""  # Replace similar sentences with an empty string

    # Calculate and print the percentage of removed sentences
    total_sentences = len(disputed_sentences)
    removed_sentences = sum(removed_sentence_count.values())
    percentage_removed = (removed_sentences / total_sentences) * 100
    print(f"Total Sentences in {disputed_text_name}: {total_sentences}")
    print(f"Removed Sentences: {removed_sentences}")
    print(f"Percentage Removed: {percentage_removed:.2f}%")

    # Write modified disputed text to a new file
    modified_disputed_path = os.path.join(results_directory, f'modified_{disputed_text_name}.txt')
    with open(modified_disputed_path, 'w', encoding='utf-8') as modified_file:
        modified_file.write('\n'.join([sentence for sentence in disputed_sentences if sentence]))

    print(f"Modified disputed text saved to: {modified_disputed_path}")

Examples for sen_oct vs sen_ag:
Examples for sen_oct vs sen_thy:
Examples for sen_oct vs sen_her_f:
Examples for sen_oct vs sen_phaed:
Similarity: 0.7387
Disputed Sentence: cur genae fletu madent?
Senecan Sentence: Et si odia servas, cur madent fletu genae?
Examples for sen_oct vs sen_phoen:
Similarity: 0.8404
Disputed Sentence: Et hoc sat est?
Senecan Sentence: nec hoc sat est:
Examples for sen_oct vs sen_oed:
Similarity: 0.6917
Disputed Sentence: Parere dubitas?
Senecan Sentence: dubitas?
Similarity: 0.6917
Disputed Sentence: Parere dubitas?
Senecan Sentence: dubitas?
Examples for sen_oct vs sen_med:
Examples for sen_oct vs sen_tro:
Total Sentences in sen_oct: 422
Removed Sentences: 4
Percentage Removed: 0.95%
Modified disputed text saved to: ../lines-similarity/results_line_sim_cosine/modified_sen_oct.txt
Examples for sen_her_o vs sen_ag:
Similarity: 0.7724
Disputed Sentence: peractum est.
Senecan Sentence: habet, peractum est.
Similarity: 0.6051
Disputed Sentence: Habet, peractum e