In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def text_similarity(text1, text2):
    """
    Compute the cosine similarity between two texts.

    Both texts are converted into word-count vectors by a default
    CountVectorizer fitted on just these two texts, and the cosine
    similarity between the two vectors is returned.

    Args:
    text1 (str): The first text.
    text2 (str): The second text.

    Returns:
    float: The cosine similarity between the two texts. 0.0 is returned
    when the vectorizer finds no usable token in either text (for example
    when both texts are empty).
    """
    try:
        # fit_transform returns the document-term count matrix (one row per text),
        # not the vectorizer itself.
        count_matrix = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when no token
        # is found in any document; treat that as "nothing in common" instead
        # of crashing the caller.
        return 0.0

    # Pairwise similarity matrix of the two rows; the off-diagonal entry
    # [0, 1] (equal to [1, 0]) is the similarity between text1 and text2.
    pairwise = cosine_similarity(count_matrix)
    return pairwise[0, 1]

# Example usage: compare two short sentences that differ in a single word
# ("first" vs "second") and print the resulting score.
text1 = "This is the first text."
text2 = "This is the second text."
# text_similarity builds word-count vectors for both sentences and returns
# their cosine similarity.
similarity_score = text_similarity(text1, text2)
print("Cosine Similarity between the texts:", similarity_score)


Cosine Similarity between the texts: 0.7999999999999999


In [19]:
import os
import zipfile
import shutil

def unzip_file(zip_file, output_folder):
    """
    Unzip a zip file to the specified output folder.

    Any existing contents of the output folder are deleted before the
    archive is extracted, so the folder only contains the files from this
    archive afterwards. The archive is opened before the folder is cleared,
    so a missing or unreadable zip file leaves the existing folder untouched.

    Args:
    zip_file (str): Path to the zip file.
    output_folder (str): Path to the output folder where the contents will be extracted.

    Returns:
    str: Path to the folder containing the extracted files.

    Raises:
    FileNotFoundError: If zip_file does not exist.
    zipfile.BadZipFile: If zip_file is not a valid zip archive.
    """
    # Open the archive first: if this fails, nothing has been deleted yet.
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        # Empty the output folder if it already exists
        if os.path.isdir(output_folder):
            shutil.rmtree(output_folder)

        # (Re)create the output folder, then extract into it
        os.makedirs(output_folder, exist_ok=True)
        zip_ref.extractall(output_folder)

    return output_folder


# Ask the user for the path to the zip file
zip_file = input("Enter the path to the zip file: ").strip()

# Stop the cell right away if the archive is missing. Continuing would call
# unzip_file on a path that cannot be opened and leave `unzipped_folder`
# undefined for the cells below.
if not os.path.isfile(zip_file):
    raise FileNotFoundError("Error: The specified zip file does not exist.")

# Get the base name of the zip file (without extension)
base_name = os.path.splitext(os.path.basename(zip_file))[0]

# Folder next to the zip file, named after it.
# NOTE(review): this value is computed but is NOT the extraction target below;
# the fixed ZIP_OUTPUT_DIR is used instead. Confirm which one is intended.
output_folder = os.path.join(os.path.dirname(zip_file), base_name)

# Fixed destination that the archive is extracted into.
# NOTE(review): machine-specific absolute path; consider making it configurable.
ZIP_OUTPUT_DIR = '/Users/rishit/Documents/innovate_you/plagiarism_detection/plagiarism_rishit/zip_outputs'

# Unzip the file
unzipped_folder = unzip_file(zip_file, ZIP_OUTPUT_DIR)

print(f"The zip file has been successfully extracted to: {unzipped_folder}")




The zip file has been successfully extracted to: /Users/rishit/Documents/innovate_you/plagiarism_detection/plagiarism_rishit/zip_outputs


In [21]:
import os
import glob
import PyPDF2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def read_pdf(file_path):
    """
    Read text content from a PDF file.

    The text of every page is extracted with PyPDF2 and the pages are joined
    with a newline, so the last word of one page is not fused with the first
    word of the next page.

    Args:
    file_path (str): Path to the PDF file.

    Returns:
    str: Text content of the PDF (empty string if no text could be extracted).
    """
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open; `or ""` guards against a page
        # for which no text is returned.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # Join with a newline so words on either side of a page break stay separate.
    return "\n".join(page_texts)

def text_similarity(text1, text2):
    """
    Compute the cosine similarity between two texts.

    Both texts are converted into word-count vectors by a default
    CountVectorizer fitted on just these two texts, and the cosine
    similarity between the two vectors is returned.

    Args:
    text1 (str): The first text.
    text2 (str): The second text.

    Returns:
    float: The cosine similarity between the two texts. 0.0 is returned
    when the vectorizer finds no usable token in either text (for example
    when both texts are empty, such as PDFs with no extractable text).
    """
    try:
        # fit_transform returns the document-term count matrix (one row per text),
        # not the vectorizer itself.
        count_matrix = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when no token
        # is found in any document; treat that as "nothing in common" instead
        # of aborting the whole comparison run.
        return 0.0

    # Pairwise similarity matrix of the two rows; the off-diagonal entry
    # [0, 1] (equal to [1, 0]) is the similarity between text1 and text2.
    pairwise = cosine_similarity(count_matrix)
    return pairwise[0, 1]

def main(folder_path, complete_threshold=0.9, potential_threshold=0.75):
    """
    Compare every pair of PDF files in a folder and report their similarity.

    Each PDF directly inside folder_path is read once, then every unordered
    pair of files is scored with text_similarity and a verdict is printed.

    Args:
    folder_path (str): Folder that contains the PDF files (matched as "*.pdf").
    complete_threshold (float): Scores strictly above this value are reported
        as complete plagiarism. Defaults to 0.9.
    potential_threshold (float): Scores strictly above this value (but not above
        complete_threshold) are reported as potential plagiarism. Defaults to 0.75.

    Returns:
    None: Results are printed, nothing is returned.
    """
    # Get all PDF files in the folder; sorted because glob gives no ordering
    # guarantee and the report should be reproducible between runs.
    pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))
    num_files = len(pdf_files)

    print(f"Found {num_files} PDF files in the folder.")

    if num_files == 0:
        print("No PDF files found in the specified folder.")
        return

    # Read every PDF exactly once up front instead of re-parsing both files
    # for each pair inside the nested loop.
    file_names = [os.path.basename(path) for path in pdf_files]
    texts = [read_pdf(path) for path in pdf_files]

    # Compute similarity scores for all pairs of texts
    for i in range(num_files):
        for j in range(i + 1, num_files):
            file1_name = file_names[i]
            file2_name = file_names[j]
            print(f"Comparing files '{file1_name}' and '{file2_name}'")
            similarity_score = text_similarity(texts[i], texts[j])
            print(f"Similarity between '{file1_name}' and '{file2_name}': {similarity_score}")
            if similarity_score > complete_threshold:
                print(f"Complete plagiarism detected between '{file1_name}' and '{file2_name}'!")
            elif similarity_score > potential_threshold:
                print(f"potential plagiarism detected between '{file1_name}' and '{file2_name}'!")
            else:
                print(' work done well ')


# Example usage: the PDFs are expected in the "pdfs" sub-folder of the folder
# the zip archive was extracted into (`unzipped_folder` is set by the unzip cell).
# os.path.join is used instead of string concatenation so the separator is
# always correct.
folder_path = os.path.join(unzipped_folder, 'pdfs')
main(folder_path)


Found 3 PDF files in the folder.
Comparing files 'assignment1.pdf' and 'Assignment 2.pdf'
Similarity between 'assignment1.pdf' and 'Assignment 2.pdf': 0.0
 work done well 
Comparing files 'assignment1.pdf' and 'Assignment 2 copy.pdf'
Similarity between 'assignment1.pdf' and 'Assignment 2 copy.pdf': 0.0
 work done well 
Comparing files 'Assignment 2.pdf' and 'Assignment 2 copy.pdf'
Similarity between 'Assignment 2.pdf' and 'Assignment 2 copy.pdf': 0.9999999999999986
Complete plagiarism detected between 'Assignment 2.pdf' and 'Assignment 2 copy.pdf'!
