In [42]:
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import pandas as pd
import os
% % time


# Path to the directory containing the text files
corpus_dir = "../../corpus_seneca/"

# Load the text files and create the corpus
corpus = []
file_names = []
for filename in os.listdir(corpus_dir):
    file_path = os.path.join(corpus_dir, filename)
    with open(file_path, "r", encoding="ISO-8859-1") as file:  # Specify the encoding as ISO-8859-1
        # Preprocess, tokenize, and add the text to the corpus
        lines = file.readlines()
        # Add further preprocessing if required
        corpus.extend(lines)
        file_names.extend([filename] * len(lines))

# Train the Word2Vec model on the corpus
model = Word2Vec(sentences=corpus, vector_size=100,
                 window=5, min_count=1, workers=4)

# Path to the disputed text file
disputed_text_path = "../../corpus_seneca/sen_oct.txt"

# Load the disputed text and preprocess it
with open(disputed_text_path, "r", encoding="ISO-8859-1") as file:  # Specify the encoding as ISO-8859-1
    disputed_text = file.readlines()
    # Preprocess, tokenize, and further process the disputed text if needed

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=["Disputed_File", "Genuine_File", "Disputed_Line_Number",
                          "Genuine_Line_Number", "Disputed_Line", "Genuine_Line", "Similarity"])

# Iterate through each line in the disputed text
for disputed_line_number, disputed_line in enumerate(disputed_text):
    # Preprocess, tokenize, and further process the line if needed
    # Tokenize and lowercase the disputed line
    disputed_words = disputed_line.strip().lower().split()

    # Compute the word embeddings for the disputed line
    disputed_line_embeddings = []
    for word in disputed_words:
        if word in model.wv:
            word_embedding = model.wv[word]
            disputed_line_embeddings.append(word_embedding)

    # Compute the average embedding for the disputed line
    disputed_line_embedding = sum(disputed_line_embeddings) / len(
        disputed_line_embeddings) if disputed_line_embeddings else None

    # Compare the disputed line's embedding with each line in the corpus
    for corpus_line_number, corpus_line in enumerate(corpus):
        # Preprocess, tokenize, and further process the corpus line if needed
        # Tokenize and lowercase the corpus line
        corpus_words = corpus_line.strip().lower().split()

        # Compute the word embeddings for the corpus line
        corpus_line_embeddings = []
        for word in corpus_words:
            if word in model.wv:
                word_embedding = model.wv[word]
                corpus_line_embeddings.append(word_embedding)

        # Compute the average embedding for the corpus line
        corpus_line_embedding = sum(
            corpus_line_embeddings) / len(corpus_line_embeddings) if corpus_line_embeddings else None

        # Compute the cosine similarity between the disputed line and the corpus line
        if disputed_line_embedding is not None and corpus_line_embedding is not None:
            similarity = cosine_similarity([disputed_line_embedding], [
                                           corpus_line_embedding])[0][0]
            if similarity >= 0.9:
                genuine_file = file_names[corpus_line_number]
                genuine_line_number = corpus_line_number + 1

                # Add the similar line to the results DataFrame
                new_row = pd.DataFrame({
                    "Disputed_File": disputed_text_path,
                    "Genuine_File": genuine_file,
                    "Disputed_Line_Number": disputed_line_number + 1,
                    "Genuine_Line_Number": genuine_line_number,
                    "Disputed_Line": disputed_line.strip(),
                    "Genuine_Line": corpus_line.strip(),
                    "Similarity": similarity
                }, index=[0])
                results_df = pd.concat(
                    [results_df, new_row], ignore_index=True)

# Print the results DataFrame
results_df.to_csv("../lines-similarity/similar_lines_seneca.csv",
                  sep=",", encoding="utf-8", index=False)

CPU times: user 1min 11s, sys: 139 ms, total: 1min 11s
Wall time: 1min 10s


In [43]:
import pandas as pd

# Read the exported matches back from disk so the saved CSV itself can be inspected.
data = pd.read_csv(
    filepath_or_buffer="../lines-similarity/similar_lines_seneca.csv",
)
# Bare last expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0,Disputed_File,Genuine_File,Disputed_Line_Number,Genuine_Line_Number,Disputed_Line,Genuine_Line,Similarity
0,../../corpus_seneca/sen_oct.txt,sen_ag.txt,18,57,"o lux semper funesta mihi,",O regnorum magnis fallax,1.0
1,../../corpus_seneca/sen_oct.txt,sen_ag.txt,18,169,"o lux semper funesta mihi,","o scelera semper sceleribus vincens domus,",1.0
2,../../corpus_seneca/sen_oct.txt,sen_ag.txt,18,311,"o lux semper funesta mihi,","Canite, o pubes inclita, Phoebum!",1.0
3,../../corpus_seneca/sen_oct.txt,sen_ag.txt,18,349,"o lux semper funesta mihi,","ades, o magni, soror et coniunx,",1.0
4,../../corpus_seneca/sen_oct.txt,sen_ag.txt,18,369,"o lux semper funesta mihi,","tuque, o magni gnata Tonantis,",1.0
...,...,...,...,...,...,...,...
1188,../../corpus_seneca/sen_oct.txt,sen_tro.txt,1014,10885,hanc quoque tristi procul a poena,Troadum Non rude vulgus lacrimisque a ovum,1.0
1189,../../corpus_seneca/sen_oct.txt,sen_tro.txt,1014,11135,hanc quoque tristi procul a poena,a caede nostra regia cessat manus,1.0
1190,../../corpus_seneca/sen_oct.txt,sen_tro.txt,1014,11363,hanc quoque tristi procul a poena,"semper tenebit, semper a tergo timor",1.0
1191,../../corpus_seneca/sen_oct.txt,sen_tro.txt,1014,11864,hanc quoque tristi procul a poena,semper a semper dolor est malignus:,1.0
