<a href="https://colab.research.google.com/github/Sreejan09/NLP_German/blob/main/Ques_7_DeDuplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import hashlib
from datasketch import MinHash
from simhash import Simhash

In [None]:
# Folder configuration for the de-duplication run.
# input_folder: directory that is scanned for article text files.
# output_folder: directory that receives the articles kept as unique.
input_folder = "input_articles"
output_folder = "filtered_articles"

In [None]:
# Create the folder for filtered articles. exist_ok=True makes this a no-op
# when the folder is already present and avoids the check-then-create race of
# a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Helper: SimHash fingerprint of a piece of text
def compute_simhash(text):
    """Return the integer ``value`` of the Simhash built from *text*."""
    fingerprint = Simhash(text)
    return fingerprint.value

# Helper: MinHash signature over the whitespace-separated words of a text
def compute_minhash(text, num_perm=128):
    """Build a MinHash for *text*.

    The text is split on whitespace and every word (UTF-8 encoded) is fed
    into a MinHash created with ``num_perm`` permutations.

    Returns:
        The populated MinHash object.
    """
    signature = MinHash(num_perm=num_perm)
    tokens = text.split()
    for token in tokens:
        encoded = token.encode('utf8')
        signature.update(encoded)
    return signature

# Function to check similarity between hashes
def is_similar(hash1, hash2, threshold=0.8):
    """Return True when two signatures are at least ``threshold`` similar.

    The similarity is read from ``hash1.jaccard(hash2)``, which is the
    estimator datasketch's MinHash provides. Objects that instead expose a
    ``similarity(other)`` method are still accepted, so callers written
    against the previous behaviour keep working.

    Args:
        hash1: Signature object providing ``jaccard`` (or ``similarity``).
        hash2: Signature to compare against; passed to that method.
        threshold: Minimum similarity (inclusive) to count as similar.

    Returns:
        bool: whether the estimated similarity is >= ``threshold``.

    Raises:
        AttributeError: If ``hash1`` offers neither method.
    """
    estimate = getattr(hash1, "jaccard", None)
    if estimate is None:
        # Fall back to the method name the original implementation called.
        estimate = hash1.similarity
    return estimate(hash2) >= threshold

In [None]:

# Load and deduplicate articles
def deduplicate_articles(input_folder, output_folder, method="simhash", similarity_threshold=0.8):
    """Copy every article that is not a near-duplicate of an earlier one.

    Regular files in ``input_folder`` are visited in sorted name order so the
    result is reproducible. Each file is fingerprinted once with the chosen
    method and compared against the fingerprints of the articles already
    kept; the first article of every near-duplicate group is written to
    ``output_folder`` under its original file name.

    Args:
        input_folder: Directory containing UTF-8 encoded text files.
        output_folder: Directory receiving the unique articles; created if it
            does not exist yet.
        method: ``"simhash"`` or ``"minhash"``.
        similarity_threshold: For ``"minhash"``, the minimum estimated Jaccard
            similarity (inclusive) that marks a duplicate. For ``"simhash"``,
            two articles are duplicates when the Hamming distance between
            their fingerprints is below ``(1 - similarity_threshold) * 64``.

    Raises:
        ValueError: If ``method`` is not one of the supported names. This is
            checked before any file is touched.
    """
    if method not in ("simhash", "minhash"):
        raise ValueError("Method should be either 'simhash' or 'minhash'.")

    os.makedirs(output_folder, exist_ok=True)

    # Same distance bound the comparison has always used (64 = fingerprint
    # width assumed for Simhash); hoisted because it does not change per file.
    max_distance = (1 - similarity_threshold) * 64

    # Fingerprints of the articles kept so far. A list, because they are only
    # ever scanned pairwise and never looked up by equality.
    kept_hashes = []
    file_count = 0

    for filename in sorted(os.listdir(input_folder)):
        file_path = os.path.join(input_folder, filename)
        if not os.path.isfile(file_path):
            # Sub-directories (e.g. an output folder nested in the input
            # folder) cannot be read as articles.
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        if method == "simhash":
            # Wrap the integer fingerprint so the library's Hamming distance
            # can be used without re-tokenising the text on every comparison.
            current_hash = Simhash(compute_simhash(text))
            is_duplicate = any(
                current_hash.distance(prev_hash) < max_distance
                for prev_hash in kept_hashes
            )
        else:
            current_hash = compute_minhash(text)
            is_duplicate = any(
                current_hash.jaccard(prev_hash) >= similarity_threshold
                for prev_hash in kept_hashes
            )

        if is_duplicate:
            continue

        kept_hashes.append(current_hash)
        file_count += 1
        # Save the non-duplicate file to the new folder
        with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as output_file:
            output_file.write(text)

    print(f"Deduplication complete! {file_count} unique articles saved to {output_folder}")



In [None]:

# Kick off de-duplication over the configured folders using SimHash
deduplicate_articles(
    input_folder,
    output_folder,
    method="simhash",
    similarity_threshold=0.8,
)