# Import classes and functions

In [1]:
from processing import Processing
from decision import Decision, get_auc

# Initialize the Processing class with the following parameters, then train the model that will process all of our data:

In [2]:
# Sentence-level, lemmatizing configuration of the project's Processing class.
doc2vec = Processing(
    training_directory='../training_data',
    test_directory='../test_data',
    document_or_sentences='sentences',
    lemmatize_or_stemming='lemmatize',
)

In [3]:
# Train the sentence-level model configured in the previous cell.
doc2vec.train_model()

# Get the training results, which are returned as a dictionary whose keys are file names and whose values are the cosine similarity between the document and the training data. These results will be used to evaluate the accuracy of the model.

In [4]:
# Per the notebook text, this is a dictionary keyed by file name whose values
# are cosine similarities — TODO confirm against Processing.get_training_results.
training_results = doc2vec.get_training_results()

## The classification of the test documents. True means the text is plagiarized and False means the text is not plagiarized.

In [5]:
# Ground-truth labels for the test documents, keyed by file name:
# True marks a plagiarized text, False marks an original one.
# Insertion order matches the hand-written listing (FID files first, then org).
validation_dictionary = {
    **dict.fromkeys(
        (
            'FID-01.txt', 'FID-02.txt', 'FID-03.txt', 'FID-04.txt',
            'FID-05.txt', 'FID-06.txt', 'FID-07.txt', 'FID-08.txt',
            'FID-09.txt', 'FID-10.txt',
        ),
        True,
    ),
    **dict.fromkeys(
        (
            'org-002.txt', 'org-005.txt', 'org-012.txt', 'org-020.txt',
            'org-026.txt', 'org-040.txt', 'org-047.txt', 'org-056.txt',
            'org-059.txt', 'org-061.txt', 'org-065.txt', 'org-071.txt',
            'org-081.txt', 'org-084.txt', 'org-090.txt', 'org-093.txt',
            'org-100.txt', 'org-001.txt', 'org-110.txt',
        ),
        False,
    ),
}

### Initialize the Decision class and get the confusion matrix which will be used to calculate the AUC.

In [6]:
# Decision supplies get_confusion_matrix and get_plagiarism_sentences,
# both of which are called in later cells.
decision = Decision()

In [7]:
# Build the confusion matrix from the model results and the labelled
# validation dictionary.
confusion_matrix = decision.get_confusion_matrix(
    training_results,
    validation_dictionary,
)

# Calculate the AUC, which is the area under the ROC curve. The AUC is a measure of how well the model can distinguish between the positive and negative classes.

In [8]:
# get_auc derives the score from the confusion matrix built in the previous cell.
# NOTE(review): a single confusion matrix describes one ROC operating point —
# confirm how get_auc turns it into an area under the curve.
auc = get_auc(confusion_matrix)
print(auc)

0.95


# Get the most similar document sentences. This returns a list of the most similar sentences in the training data.

## Initialize the document model to get the most similar document

In [9]:
# Document-level counterpart of the sentence model: same directories and
# lemmatization, but document_or_sentences='document'. Trained in this cell.
doc2vecdocuments = Processing(
    training_directory='../training_data',
    test_directory='../test_data',
    document_or_sentences='document',
    lemmatize_or_stemming='lemmatize',
)
doc2vecdocuments.train_model()

In [10]:
# Query both models with the same test document: the document-level model for
# its closest documents, the sentence-level model for its closest sentences.
suspect_path = '../test_data/FID-01.txt'
top = doc2vecdocuments.get_most_similar_documents(suspect_path)
lst = doc2vec.get_most_similar_document_sentences(suspect_path)

### Get the plagiarism sentences. This returns a list of the most similar sentences in the training data, as well as the plagiarism percentage for the whole document.

In [11]:
# Print the per-sentence plagiarism report for the sentence matches in `lst`.
# NOTE(review): the recorded output begins a sentence with 'ï»¿', which looks
# like a UTF-8 byte-order mark decoded with a single-byte codec — presumably the
# text files should be read with encoding='utf-8-sig'; confirm in Processing.
print(decision.get_plagiarism_sentences(lst))

PLAGIARISM DETECTED

Sentence: 'ï»¿This article delves into the intricacies of adaptive fuzzy event-triggered formation tracking control for nonholonomic multirobot systems characterized by infinite actuator faults and range constraints.' || presents plagiarism from  'org-076.txt' sentence 'ï»¿This article delves into the intricacies of adaptive fuzzy event-triggered formation tracking control for nonholonomic multirobot systems characterized by infinite actuator faults and range constraints.'

 Sentence: Traditional cheating detection methods have many disadvantages, such as difficult to detect covert equipment cheating, multi-source cheating, difficult to distinguish plagiarists from plagiarists, difficult to distinguish plagiarists from victims, or plagiarism from coincidences. || does not present plagiarism

 Sentence: 'To address these issues, we leverage the power of fuzzy logic systems (FLSs) and employ adaptive methods to approximate unknown nonlinear functions and uncertain pa