## Q3: Text Similarity

### Step #1: Importing necessary packages

In [None]:
import os
import numpy as np
import pandas as pd
import re

### Step #2: Defining the SimilarityCalculator class

In [None]:
class SimilarityCalculator:
    """
    Scores how similar one search document is to every document of a corpus.

    Each document is reduced to its set of cleaned, lowercase words. Every
    document is then represented as a binary vector over the vocabulary of the
    search document and compared with the search document's own vector using
    the dot product, a Euclidean-distance-based score and cosine similarity.
    """

    # Matches every character that is not an ASCII letter; compiled once
    # because clean_word is called for every token of every file.
    _NON_ALPHABETIC = re.compile(r"[^a-zA-Z]")

    def __init__(self, corpus_directory, search_directory):
        """
        Initializes a SimilarityCalculator object.
        Parameters:
        - corpus_directory (str): The directory path of the corpus.
        - search_directory (str): The directory path of the search documents.
        """
        self.corpus_directory = corpus_directory
        self.search_directory = search_directory
        # file name -> set of cleaned words found in that corpus file
        self.unique_words_corpus = {}
        # "search" -> set of cleaned words found in the search document
        self.unique_words_search = {}
        # document name -> binary vector over the search vocabulary
        self.vectors = {}
        # Result containers are created up front so that __str__ works even
        # before checkPlagiarism() has been run.
        self.results_dot = {}
        self.results_euclidean = {}
        self.results_cosine = {}
        self.results_df = None

    def clean_word(self, word):
        """
        Cleans a word by removing non-alphabetic characters and converting it to lowercase.
        Parameters:
        - word (str): The word to be cleaned.
        Returns:
        - str: The cleaned word (empty if the word contained no ASCII letters).
        """
        return self._NON_ALPHABETIC.sub("", word).lower()

    def _read_unique_words(self, path):
        """
        Reads one text file and returns its set of cleaned, non-empty words.
        Parameters:
        - path (str): The path of the file to read.
        Returns:
        - set: The unique cleaned words of the file.
        """
        # The file is decoded as a whole before any word is collected, so a
        # decoding error half-way through cannot leave partial results behind.
        try:
            with open(path, "r", encoding="utf-8") as file:
                text = file.read()
        except UnicodeDecodeError:
            # Fallback for files that are not valid UTF-8.
            with open(path, "r", encoding="ISO-8859-1") as file:
                text = file.read()

        cleaned_words = (self.clean_word(word) for word in text.split())
        # Tokens without any letters (e.g. "123" or "--") clean to "" and must
        # not count as a word shared between documents.
        return {word for word in cleaned_words if word}

    def get_unique_words(self, target_directory):
        """
        Loads every .txt file of a directory into the unique-word dictionaries.

        Files of the search directory are stored under the single key
        "search" (if there are several, the last one in sorted order wins);
        all other files are stored in the corpus dictionary under their file
        name. Files without a ".txt" extension are reported and skipped.
        Parameters:
        - target_directory (str): The directory path of the target documents.
        """
        # Sorted so that processing order does not depend on the file system.
        for file_name in sorted(os.listdir(target_directory)):
            if not file_name.endswith(".txt"):
                print(
                    f"File {file_name} is not a text file and will therefore not be processed."
                )
                continue

            unique_words = self._read_unique_words(
                os.path.join(target_directory, file_name)
            )
            if target_directory == self.search_directory:
                self.unique_words_search["search"] = unique_words
            else:
                self.unique_words_corpus[file_name] = unique_words

    def vectorize_documents(self):
        """
        Vectorizes the documents based on the unique words of the search document.

        Every loaded document gets a binary vector with one component per word
        of the search vocabulary: 1 if the document contains the word, else 0.
        Raises:
        - KeyError: If no search document has been loaded.
        """
        if "search" not in self.unique_words_search:
            raise KeyError(
                f"No .txt search document was loaded from '{self.search_directory}'."
            )

        # A sorted vocabulary gives every vector the same, reproducible
        # component order.
        vocabulary = sorted(self.unique_words_search["search"])

        for documents in (self.unique_words_corpus, self.unique_words_search):
            for file_name, unique_words in documents.items():
                self.vectors[file_name] = [
                    1 if word in unique_words else 0 for word in vocabulary
                ]

    def calculate_similarity(self, vec1, vec2, measure):
        """
        Calculates the similarity between two vectors.
        Parameters:
        - vec1 (list): The first vector.
        - vec2 (list): The second vector.
        - measure (str): The similarity measure to be used. Possible values: 'dot', 'euclidean', 'cosine'.
        Returns:
        - float: The similarity score.
        Raises:
        - ValueError: If measure is not one of the supported values.
        """
        if measure == "dot":
            return np.dot(vec1, vec2)
        if measure == "euclidean":
            # Turns a distance into a score: identical vectors give 1.
            return 1 / (1 + np.linalg.norm(np.array(vec1) - np.array(vec2)))
        if measure == "cosine":
            norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
            if norm_product == 0:
                # A zero vector shares nothing with the other one; report 0
                # instead of dividing by zero and returning nan.
                return 0.0
            return np.dot(vec1, vec2) / norm_product
        raise ValueError(
            f"Unknown similarity measure '{measure}'. Possible values: 'dot', 'euclidean', 'cosine'."
        )

    def calculate_similarities(self, measure):
        """
        Calculates the similarity between the search document and the corpus documents.
        Parameters:
        - measure (str): The similarity measure to be used. Possible values: 'dot', 'euclidean', 'cosine'.
        Returns:
        - dict: A dictionary containing the similarity scores for each corpus document.
        """
        return {
            file_name: self.calculate_similarity(
                self.vectors["search"], vector, measure
            )
            for file_name, vector in self.vectors.items()
            if file_name != "search"
        }

    def checkPlagiarism(self):
        """
        Checks if the search document is plagiarized.

        Loads both directories, vectorizes all documents, computes the three
        similarity measures and stores them in self.results_df, sorted by
        cosine similarity in descending order.
        Returns:
        - str: A report containing the similarity scores for each corpus document.
        """
        # Start from a clean state so that repeated runs do not keep entries
        # of files that have been removed in the meantime.
        self.unique_words_corpus = {}
        self.unique_words_search = {}
        self.vectors = {}

        self.get_unique_words(self.corpus_directory)
        self.get_unique_words(self.search_directory)
        self.vectorize_documents()
        self.results_dot = self.calculate_similarities("dot")
        self.results_euclidean = self.calculate_similarities("euclidean")
        self.results_cosine = self.calculate_similarities("cosine")

        self.results_df = pd.DataFrame(
            {
                "dot": self.results_dot,
                "euclidean": self.results_euclidean,
                "cosine": self.results_cosine,
            }
        )

        # Cosine similarity is seen as the most accurate measure for plagiarism detection. Therefore, the df is sorted by cosine similarity.
        self.results_df = self.results_df.sort_values(by="cosine", ascending=False)

        return f"Similarity of search document at path '{self.search_directory}' with corpus documents at path '{self.corpus_directory}':\n{self.results_df}\nHigher values indicate higher similarity."

    def __str__(self):
        """
        Returns the configured directories followed by the latest results.
        """
        if self.results_df is None:
            results = "<no results yet - call checkPlagiarism() first>"
        else:
            results = self.results_df
        return f"corpusDirectory={self.corpus_directory} \n searchDirectory={self.search_directory} \n {results}"

### Step #3: Defining paths, creating a class instance and running the similarity check

In [None]:
# The paths are assembled with os.path.join so that this cell runs unchanged
# on Windows, macOS and Linux; there is no need to switch between "\\" and
# "/" separators by hand.
corpus_directory = os.path.join("..", "question3", "corpusFiles")
search_directory = os.path.join("..", "question3", "searchFiles")

# checkPlagiarism() loads both directories, scores every corpus document
# against the search document and returns the report as a string.
test_exam = SimilarityCalculator(corpus_directory, search_directory)
results = test_exam.checkPlagiarism()
print(results)