<a href="https://colab.research.google.com/github/XanimGuliyeva/Document_Similarity_Checker/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasketch
import datasketch

In [None]:
import os
import gdown
import numpy as np
import tarfile
import nltk
from nltk.tokenize import word_tokenize
from datasketch import MinHash, MinHashLSH


# Google Drive file ID and output file
file_id = '1AIwNIs40Ix3kpEXzzVJqmdaT5Twktfl7'
output = '20_newsgroups.tar.gz'

# Download the tar.gz file
gdown.download(f'https://drive.google.com/uc?id={file_id}', output, quiet=False)

# Extract the tar.gz file into the '20_newsgroups' directory.
# The archive was just fetched from the network, so its member names should
# not be trusted: without a filter, extractall() will follow absolute paths
# and '..' components and can write outside the target directory.
with tarfile.open(output, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        # Extraction filters are available: reject unsafe members.
        tar.extractall('20_newsgroups', filter='data')
    else:
        # Older Python without extraction filters: keep the previous behaviour.
        tar.extractall('20_newsgroups')

# # Function to load documents from local directories
# def load_documents_from_directory(directory_path):
#     documents = []
#     for root, dirs, files in os.walk(directory_path):
#         for file in files:
#             if file.endswith('.txt'):
#                 file_path = os.path.join(root, file)
#                 with open(file_path, 'r', encoding='utf-8') as f:
#                     documents.append(f.read())
#     return documents


In [None]:
# Path to the extracted dataset directory.
# NOTE(review): the doubled folder name presumably comes from the archive
# carrying its own top-level '20_newsgroups' folder inside the extraction
# target of the same name -- confirm against the archive layout.
dataset_path = '20_newsgroups/20_newsgroups'

# Recursively collect the path of every file below a directory
def list_files_in_directory(directory_path):
    """Return the paths of all files found under ``directory_path``.

    The tree is traversed with ``os.walk``; each returned path is the
    directory being visited joined with the file name, in the order the
    walk yields them. A path that cannot be walked (for example one that
    does not exist) produces an empty list.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(directory_path)
        for name in names
    ]

# List all files in the dataset directory. The position of a path in this
# list is what later cells use as the document's index.
file_list = list_files_in_directory(dataset_path)
print(f"Total number of files: {len(file_list)}")

In [None]:
# Import nltk before its first use so this cell does not depend on an
# earlier cell having imported it.
import nltk
from nltk.corpus import stopwords

# Stop-word lists read through nltk.corpus.stopwords.
nltk.download('stopwords')

# Punkt models used by word_tokenize().
nltk.download('punkt')

# Newer NLTK releases (3.9+) load Punkt from the 'punkt_tab' resource rather
# than the pickled 'punkt' package; fetch it as well so word_tokenize() works
# on current installations. On older releases the extra download is harmless.
nltk.download('punkt_tab')

# English stop words, loaded on the first tokenize() call and reused after
# that so the list is not reloaded and re-hashed for every document.
_STOP_WORDS = None

# Function to tokenize text and ignore stop words
def tokenize(text):
    """Split ``text`` into word tokens, dropping stop words and punctuation.

    Args:
        text: The raw document text.

    Returns:
        A list of tokens in their original order and original letter case.
        A token is kept only if it consists solely of alphanumeric
        characters (``str.isalnum``) and its lower-cased form is not an
        English stop word.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return [
        token
        for token in word_tokenize(text)
        if token.isalnum() and token.lower() not in _STOP_WORDS
    ]

# Function to generate shingles
def get_shingles(tokens, k=9):
    """Return the set of ``k``-token shingles of a token sequence.

    A shingle is ``k`` consecutive tokens joined by single spaces.

    Args:
        tokens: Sequence of string tokens.
        k: Number of tokens per shingle; must be at least 1.

    Returns:
        A set with one string per distinct window of ``k`` consecutive
        tokens. The set is empty when there are fewer than ``k`` tokens.

    Raises:
        ValueError: If ``k`` is smaller than 1. Such a value would otherwise
            silently produce empty or wrong-length shingles.
    """
    if k < 1:
        raise ValueError(f"shingle size k must be >= 1, got {k}")
    return {
        ' '.join(tokens[start:start + k])
        for start in range(len(tokens) - k + 1)
    }

# Build the MinHash signature of a collection of shingles
def compute_minhash(shingles, num_hashes=100):
    """Return a ``MinHash`` object updated with every shingle.

    The signature is created with ``num_perm=num_hashes`` and each shingle
    is fed to it as UTF-8 encoded bytes. An empty collection yields a
    ``MinHash`` that has received no updates.
    """
    signature = MinHash(num_perm=num_hashes)
    for encoded in (text.encode('utf8') for text in shingles):
        signature.update(encoded)
    return signature


In [None]:

# Accumulators filled by the processing loop below; both receive one entry
# per file, in the same order as file_list.
minhash_signatures = []  # MinHash object of each document
shingled_documents = []  # shingle set of each document

# Set the size of each shingle (in tokens) and the number of hash functions
# (num_perm) used for every MinHash signature
shingle_size = 9
num_hashes = 100

# Derive a document ID from a file path
def extract_doc_id(file_path):
    """Return the file name of ``file_path`` without its final extension.

    Directory components are removed first; then only the last suffix, if
    any, is stripped from the remaining name.
    """
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem

# For every file: read it, tokenize it, build its shingle set and compute its
# Minhash signature. Results are appended in file_list order.
for index, file_path in enumerate(file_list):
    # Only the read needs the file handle; release it before the heavier work.
    with open(file_path, 'r', encoding='latin-1') as file:
        content = file.read()

    tokens = tokenize(content)
    shingles = get_shingles(tokens, shingle_size)
    shingled_documents.append(shingles)

    minhash = compute_minhash(shingles, num_hashes)
    minhash_signatures.append(minhash)

    doc_id = extract_doc_id(file_path)
    if index >= 10:
        continue

    # Preview of the intermediate results for the first 10 documents only
    print(f"Document {doc_id} tokens: {tokens}")
    print(f"Document {doc_id} shingles: {shingles}")
    print(f"Document {doc_id} Minhash signature: {minhash.hashvalues}")


In [None]:
# Index all signatures with LSH and look up the candidates of each document
def create_lsh(minhash_signatures, file_list, threshold=0.7, num_hashes=100):
    """Build a MinHash LSH index and query it once per document.

    Args:
        minhash_signatures: MinHash objects, one per document.
        file_list: File paths, where position ``i`` belongs to the
            signature at position ``i``.
        threshold: Value passed to ``MinHashLSH`` as ``threshold``.
        num_hashes: Value passed to ``MinHashLSH`` as ``num_perm``.

    Returns:
        A tuple ``(candidate_pairs, doc_id_to_path)``.
        ``candidate_pairs`` is a list with one ``(doc_id, candidates)``
        entry per signature, where ``candidates`` are the keys returned by
        the index query with the document's own key removed.
        ``doc_id_to_path`` maps each ``"doc_<i>"`` key to ``file_list[i]``.
    """
    index = MinHashLSH(threshold=threshold, num_perm=num_hashes)

    # Register every signature under a positional key
    for position, signature in enumerate(minhash_signatures):
        index.insert(f"doc_{position}", signature)

    # Query the index with each signature and drop the self-match
    candidate_pairs = []
    for position, signature in enumerate(minhash_signatures):
        own_key = f"doc_{position}"
        others = [key for key in index.query(signature) if key != own_key]
        candidate_pairs.append((own_key, others))

    # Positional key -> file path
    doc_id_to_path = {
        f"doc_{position}": path for position, path in enumerate(file_list)
    }

    return candidate_pairs, doc_id_to_path



# Build the LSH index from the signatures and file paths computed earlier
candidate_pairs, doc_id_to_path = create_lsh(minhash_signatures, file_list, threshold=0.7, num_hashes=100)

# Report, by file name, every document that has at least one candidate
for doc, pairs in candidate_pairs:
    if not pairs:
        continue  # no candidates for this document
    doc_path = doc_id_to_path[doc]
    doc_name = os.path.basename(doc_path)
    # Each candidate is shown as (full path, file name)
    paired_docs = [
        (candidate_path, os.path.basename(candidate_path))
        for candidate_path in (doc_id_to_path[pair] for pair in pairs)
    ]
    print(f"Document {doc_name} has candidate pairs: {paired_docs}")

In [None]:
# Function to compute Jaccard similarity
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two sets.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float between 0.0 and 1.0. When both sets are empty the ratio is
        undefined; 0.0 is returned in that case, because two documents with
        no shingles at all give no evidence of being similar.
    """
    union = len(set1.union(set2))
    if union == 0:
        # Both sets are empty: avoid dividing by zero.
        return 0.0
    intersection = len(set1.intersection(set2))
    return intersection / union

# Keep only the candidate pairs whose exact Jaccard similarity is high enough
def compute_similar_pairs(candidate_pairs, shingled_documents, doc_id_to_path, similarity_threshold=0.5):
    """Verify candidate pairs with the Jaccard similarity of their shingles.

    Args:
        candidate_pairs: Iterable of ``(doc_id, candidate_ids)`` entries
            whose IDs have the form ``"doc_<index>"``.
        shingled_documents: Shingle sets, indexed by the number in the ID.
        doc_id_to_path: Mapping from document ID to file path.
        similarity_threshold: A pair is kept only when its similarity is
            strictly greater than this value.

    Returns:
        A list of ``(doc_name, candidate_name, similarity)`` tuples, where
        the names are the base names of the two files. Entries are not
        de-duplicated: a pair listed in both directions in
        ``candidate_pairs`` is reported in both directions.
    """
    def position_of(key):
        # "doc_<index>" -> index
        return int(key.split('_')[1])

    similar_pairs = []
    for doc_id, candidates in candidate_pairs:
        doc_index = position_of(doc_id)
        for candidate_id in candidates:
            candidate_index = position_of(candidate_id)
            if doc_index == candidate_index:
                continue  # never compare a document with itself
            score = jaccard_similarity(
                shingled_documents[doc_index],
                shingled_documents[candidate_index],
            )
            if score > similarity_threshold:
                similar_pairs.append((
                    os.path.basename(doc_id_to_path[doc_id]),
                    os.path.basename(doc_id_to_path[candidate_id]),
                    score,
                ))
    return similar_pairs

# Compute similar pairs based on Jaccard similarity: only LSH candidates whose
# exact shingle-set similarity is above 0.5 are kept
similar_pairs = compute_similar_pairs(candidate_pairs, shingled_documents, doc_id_to_path, similarity_threshold=0.5)

# Print the similar pairs with their file names; the similarity is shown
# rounded to two decimals
for doc_name, candidate_name, similarity in similar_pairs:
    print(f"Documents {doc_name} and {candidate_name} are similar with Jaccard similarity {similarity:.2f}")