# Task 2
Compare the texts to determine which of them are plagiarised.
The lab consists of 10 text files, 2 of which contain plagiarised content.

1. Remove the punctuation using a linear replacement and convert the sentences into single lines of
text. Also, convert everything to upper or lower case.
2. Analyse the files to create a word list (a hashed dictionary) replacing the words with numerical
values of the word positions in the list.
3. Linked lists are the ideal mechanism for creating this dictionary.
4. Compare the numerical sentences.

# FINAL

In [39]:
import os
import string
import itertools
import re
import plotly.graph_objects as go
from nltk.tokenize import sent_tokenize


def find_lcs(sentence1, sentence2):
    """Return the longest run of consecutive words shared by two sentences.

    Both sentences are split on whitespace and compared word by word.

    Returns:
        A ``(phrase, length)`` tuple: the shared words joined by single
        spaces, and the number of words in that run. ``('', 0)`` is returned
        when the sentences have no word in common. When several runs share
        the maximum length, the one found first (scanning ``sentence1`` left
        to right, then ``sentence2`` left to right) is kept.
    """
    left = sentence1.split()
    right = sentence2.split()

    # table[i][j] is the length of the common run ending at left[i-1]/right[j-1]
    table = [[0] * (len(right) + 1) for _ in range(len(left) + 1)]
    best_phrase, best_len = '', 0

    for i, word_a in enumerate(left, start=1):
        for j, word_b in enumerate(right, start=1):
            if word_a != word_b:
                continue
            run = table[i - 1][j - 1] + 1
            table[i][j] = run
            # Strictly greater, so the earliest longest run wins ties
            if run > best_len:
                best_len = run
                best_phrase = ' '.join(left[i - run:i])
    return best_phrase, best_len


def plot_heatmap(texts):
    """Plot a heatmap of the longest shared phrase length between all texts.

    Every unique pair of texts is split into sentences, each sentence pair is
    compared with ``find_lcs``, and every non-empty match is appended to
    ``plagiarism_output.txt`` (the file is overwritten on each call). The
    heatmap cell for a pair of texts holds the length, in words, of the
    longest match found across all of their sentence pairs.

    Args:
        texts: sequence of ``(file_number, text)`` items. File numbers are
            used as 1-based row/column indices, so they are assumed to be
            ``1..len(texts)``.

    Note:
        ``sent_tokenize`` relies on sentence-ending punctuation. If the
        texts were stripped of punctuation beforehand, each text will most
        likely be treated as a single sentence -- verify against the caller.
    """
    size = len(texts)
    heatmap = [[0] * size for _ in range(size)]

    # Tokenise each text once rather than once per pair it takes part in
    tokenized = [(entry[0], sent_tokenize(entry[1])) for entry in texts]

    with open('plagiarism_output.txt', 'w', encoding='utf-8') as f:
        for (id1, sentences1), (id2, sentences2) in itertools.combinations(tokenized, 2):
            longest = 0
            for sentence1, sentence2 in itertools.product(sentences1, sentences2):
                lcs, len_lcs = find_lcs(sentence1, sentence2)
                if lcs:
                    # Keep the best match; do not let a later, shorter match overwrite it
                    longest = max(longest, len_lcs)
                    f.write(f'Plagiarized content found between {id1:02}.txt and {id2:02}.txt, matching sentence: "{lcs}"\n')

            # The matrix is symmetric: fill both triangles
            heatmap[id1 - 1][id2 - 1] = longest
            heatmap[id2 - 1][id1 - 1] = longest

    labels = [f"{i:02}.txt" for i in range(1, size + 1)]

    # The colorbar title is set on the trace itself: a layout-level coloraxis
    # only applies to traces that explicitly reference it.
    fig = go.Figure(data=go.Heatmap(
        z=heatmap,
        x=labels,
        y=labels,
        colorbar=dict(title=dict(text="Length of LCS")),
    ))
    fig.update_layout(xaxis=dict(title="File 1"),
                      yaxis=dict(title="File 2"))

    fig.show()




# Directory that holds the numbered lab files (01.txt ... 10.txt)
path = "Lab3.2"

# Load every file and normalise it to lower-case, punctuation-free, single-line text
texts = []
for i in range(1, 11):
    filepath = os.path.join(path, f"{i:02}.txt")
    with open(filepath, 'r') as f:
        raw = f.read()
    lowered = raw.lower()
    unpunctuated = lowered.translate(str.maketrans('', '', string.punctuation))
    text = unpunctuated.replace('\n', ' ')
    texts.append((i, text))

# Compare all pairs of texts and draw the LCS heatmap
plot_heatmap(texts)



## function definitions

In [43]:
import os
import string
import itertools
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import plotly.graph_objects as go
from nltk.tokenize import sent_tokenize


def find_lcs(sentence1, sentence2):
    """Find the longest common run of consecutive words in two sentences.

    The sentences are split on whitespace and compared word by word using
    the classic longest-common-substring recurrence. Only the previous row
    of the table is needed to compute the current one, so two rolling rows
    are kept instead of the full ``len(words1) x len(words2)`` table.

    Args:
        sentence1: first sentence (whitespace-separated words).
        sentence2: second sentence (whitespace-separated words).

    Returns:
        ``(lcs, len_lcs)``: the shared words joined by single spaces and the
        number of words in the run. ``('', 0)`` if no word is shared. On
        ties, the run encountered first (scanning ``sentence1`` and then
        ``sentence2`` left to right) is returned.
    """
    words1 = sentence1.split()
    words2 = sentence2.split()

    best_len = 0
    best_end = 0  # exclusive end index in words1 of the best run so far
    prev_row = [0] * (len(words2) + 1)

    for i, word1 in enumerate(words1, start=1):
        curr_row = [0] * (len(words2) + 1)
        for j, word2 in enumerate(words2, start=1):
            if word1 == word2:
                curr_row[j] = prev_row[j - 1] + 1
                # Strict comparison keeps the earliest longest run
                if curr_row[j] > best_len:
                    best_len = curr_row[j]
                    best_end = i
        prev_row = curr_row

    # Build the phrase once, after the scan, instead of on every improvement
    lcs = ' '.join(words1[best_end - best_len:best_end])
    return lcs, best_len


def compute_cosine_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    weights depend only on this pair and not on any wider corpus.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    pairwise = cosine_similarity(tfidf_matrix.toarray())
    # pairwise is 2x2; the off-diagonal entry compares text1 with text2
    return pairwise[0, 1]


def plot_heatmap(texts):
    """Create a heatmap of the longest common phrase length between texts.

    For every unique pair of texts, all sentence pairs are compared with
    ``find_lcs``. Each non-empty match is written as one line to
    ``plagiarism_output.txt`` (overwritten on every call), and the heatmap
    cell for the two texts is set to the length, in words, of the longest
    match found between them.

    Args:
        texts: sequence of ``(file_number, text)`` items. The file number is
            used as a 1-based index into the matrix, so numbers are assumed
            to run from 1 to ``len(texts)``.

    Note:
        Sentence splitting uses ``sent_tokenize``, which depends on
        punctuation. Texts whose punctuation has already been removed will
        most likely be compared as one long sentence each -- TODO confirm
        this is the intended behaviour.
    """
    count = len(texts)
    heatmap = [[0] * count for _ in range(count)]

    # Split every text into sentences a single time, up front
    sentence_lists = [(item[0], sent_tokenize(item[1])) for item in texts]

    with open('plagiarism_output.txt', 'w', encoding='utf-8') as f:
        for (num1, sents1), (num2, sents2) in itertools.combinations(sentence_lists, 2):
            best = 0
            for sentence1 in sents1:
                for sentence2 in sents2:
                    lcs, len_lcs = find_lcs(sentence1, sentence2)
                    if not lcs:
                        continue
                    # Track the maximum rather than whichever match came last
                    if len_lcs > best:
                        best = len_lcs
                    f.write(f'Plagiarized content found between {num1:02}.txt and {num2:02}.txt, matching sentence: "{lcs}"\n')

            # Mirror the value so the matrix stays symmetric
            heatmap[num1 - 1][num2 - 1] = best
            heatmap[num2 - 1][num1 - 1] = best

    # Axis labels follow the 01.txt .. NN.txt naming of the input files
    labels = [f"{i:02}.txt" for i in range(1, count + 1)]

    # Title the trace's own colorbar; layout.coloraxis is not used by a trace
    # unless the trace is bound to it.
    trace = go.Heatmap(
        z=heatmap,
        x=labels,
        y=labels,
        colorbar=dict(title=dict(text="Length of LCS")),
    )
    fig = go.Figure(data=trace)
    fig.update_layout(xaxis=dict(title="File 1"),
                      yaxis=dict(title="File 2"))

    fig.show()


def plot_cosine_similarity_heatmap(texts):
    """Create a heatmap of the cosine similarity between all pairs of texts.

    Args:
        texts: sequence of ``(file_number, text)`` items. Rows and columns
            follow the order of ``texts``; axis labels are ``01.txt`` ..
            ``NN.txt`` where ``NN`` is ``len(texts)``.

    Only off-diagonal cells are computed; the diagonal (a text compared with
    itself) is left at 0.
    """
    size = len(texts)
    heatmap = [[0] * size for _ in range(size)]

    # Each unordered pair is scored once and mirrored into both triangles
    for (i, first), (j, second) in itertools.combinations(enumerate(texts), 2):
        csim = compute_cosine_similarity(first[1], second[1])
        heatmap[i][j] = csim
        heatmap[j][i] = csim

    labels = [f"{i:02}.txt" for i in range(1, size + 1)]

    # The colorbar title goes on the trace: layout.coloraxis only affects
    # traces that reference it, which this one does not.
    fig = go.Figure(data=go.Heatmap(
        z=heatmap,
        x=labels,
        y=labels,
        zmin=0,
        zmax=1,
        colorscale='Viridis',
        colorbar=dict(title=dict(text="Cosine Similarity")),
    ))
    fig.update_layout(xaxis=dict(title="File 1"),
                      yaxis=dict(title="File 2"))

    fig.show()

def loadText(path, num_files=10, encoding=None):
    """Load and normalise the numbered text files ``01.txt``, ``02.txt``, ...

    Each file is lower-cased, stripped of ASCII punctuation
    (``string.punctuation``) and flattened to a single line by replacing
    newlines with spaces.

    Args:
        path: directory containing the files.
        num_files: how many files to read, starting at ``01.txt``
            (default 10).
        encoding: text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        A list of ``(file_number, text)`` tuples in ascending file order.

    Raises:
        OSError: if one of the expected files cannot be opened.
    """
    # Build the punctuation-stripping table once, not once per file
    strip_punctuation = str.maketrans('', '', string.punctuation)

    texts = []
    for i in range(1, num_files + 1):
        filepath = os.path.join(path, f"{i:02}.txt")
        with open(filepath, 'r', encoding=encoding) as f:
            raw = f.read()
        text = raw.lower().translate(strip_punctuation).replace('\n', ' ')
        texts.append((i, text))
    return texts

In [42]:



# NOTE: run the cell that defines loadText, plot_heatmap and
# plot_cosine_similarity_heatmap before this one; otherwise the calls below
# raise NameError.

# Define the path to the directory containing the files
path = "Lab3.2"

# Read and normalise the contents of each file
texts = loadText(path)

# Plot the LCS heatmap
plot_heatmap(texts)

# Plot the cosine similarity heatmap
plot_cosine_similarity_heatmap(texts)


NameError: name 'loadText' is not defined