<a href="https://colab.research.google.com/github/Preksha-Dadoo/Plagiarism-Checker/blob/main/Plagiarism_Checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as pd
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer  # Converts text data to numerical (TF-IDF) data
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
def read_file(filename):
    """Return the entire text of ``filename``, decoded as UTF-8.

    The file is opened in text mode and closed again before returning.
    """
    with open(filename, encoding='utf-8') as handle:
        text = handle.read()
    return text

In [27]:
def check_plagiarism(files, output_csv="plagiarism_results.csv"):
    """Score the textual similarity of every pair of files and save it as CSV.

    Each file is read with ``read_file``, the texts are turned into TF-IDF
    vectors, and the cosine similarity of every unordered pair is reported as
    a percentage (cosine similarity multiplied by 100). One line per pair is
    printed, and the full table is written to ``output_csv``.

    Parameters
    ----------
    files : iterable of str
        Paths of the text files to compare. Any iterable is accepted; it is
        materialised into a list so it can be indexed.
    output_csv : str, optional
        Destination path of the CSV report.

    Returns
    -------
    pandas.DataFrame
        One row per file pair with the columns ``"File 1"``, ``"File 2"`` and
        ``"Similarity (%)"``. The frame is empty (but still has these
        columns) when fewer than two files are given.
    """
    files = list(files)  # allow generators/tuples; we need len() and indexing
    columns = ["File 1", "File 2", "Similarity (%)"]
    results = []

    # With fewer than two files there is no pair to compare, so skip reading
    # and vectorising entirely (fitting the vectorizer on an empty corpus
    # would fail) and fall through to write a header-only report.
    if len(files) >= 2:
        file_contents = [read_file(path) for path in files]  # Read all file contents

        vectorizer = TfidfVectorizer()  # Initialize TF-IDF vectorizer
        vectors = vectorizer.fit_transform(file_contents)  # Convert text into numerical vectors

        similarity_matrix = cosine_similarity(vectors)  # Pairwise cosine similarity

        for i, j in combinations(range(len(files)), 2):
            similarity_score = similarity_matrix[i, j] * 100  # Convert to percentage
            print(f"Similarity between {files[i]} and {files[j]}: {similarity_score:.2f}%")
            results.append({"File 1": files[i], "File 2": files[j], "Similarity (%)": similarity_score})

    # Passing the columns explicitly keeps the CSV header even when there are
    # no rows, so downstream readers always see the same schema.
    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")
    return df

In [28]:
# Documents to compare against each other (one path per line)
files = [
    "/content/Arthur.txt",
    "/content/Ben.txt",
    "/content/Clark.txt",
]

# Compare every pair and write the CSV report
check_plagiarism(files)

Similarity between /content/Arthur.txt and /content/Ben.txt: 45.95%
Similarity between /content/Arthur.txt and /content/Clark.txt: 54.30%
Similarity between /content/Ben.txt and /content/Clark.txt: 40.89%
Results saved to plagiarism_results.csv
