In [5]:
import re
from collections import defaultdict
import numpy as np

# Function to read a text file and preprocess it (lowercase, remove punctuation, etc.)
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()  # Convert to lowercase
        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        return text

# Function to generate 3-shingles from the text
def generate_4_shingles(text):
    words = text.split()  # Split text into words
    shingles = []
    seen_shingles = set()  # To track unique shingles

    for i in range(len(words) - 2):  # Create 3-shingles
        shingle = ' '.join(words[i:i + 4])
        if shingle not in seen_shingles:
            shingles.append(shingle)
            seen_shingles.add(shingle)

    return shingles

# Function to create an incidence matrix based on shingles across multiple files
def create_incidence_matrix(file_paths):
    shingle_counts = defaultdict(int)  # Store shingle counts
    all_shingles = []  # List of all shingles in order

    # Step 1: Count shingles and gather all unique shingles in order
    for file_path in file_paths:
        text = read_text_file(file_path)
        shingles = generate_4_shingles(text)

        for shingle in shingles:
            shingle_counts[shingle] += 1
            if shingle not in all_shingles:
                all_shingles.append(shingle)

    num_files = len(file_paths)
    num_shingles = len(all_shingles)

    # Step 2: Create the incidence matrix (rows: shingles, cols: files)
    incidence_matrix = np.zeros((num_shingles, num_files), dtype=int)

    for j, file_path in enumerate(file_paths):
        text = read_text_file(file_path)
        shingles = generate_4_shingles(text)

        for i, shingle in enumerate(all_shingles):
            if shingle in shingles:
                incidence_matrix[i, j] = 1

    return all_shingles, incidence_matrix

# Function to print the comparison between two files
def compare_files(file_paths, comparison_pairs):
    shingles, incidence_matrix = create_incidence_matrix(file_paths)
    
    for pair in comparison_pairs:
        print(f"\nComparison between File {pair[0]} and File {pair[1]}:")
        print("{:<40} {:<10} {:<10}".format("Shingles", f"File {pair[0]}", f"File {pair[1]}"))
        print("-" * 70)

        file1_idx, file2_idx = pair[0] - 1, pair[1] - 1  # Adjust to 0-based indexing
        for idx, shingle in enumerate(shingles):
            if incidence_matrix[idx, file1_idx] == 1 or incidence_matrix[idx, file2_idx] == 1:
                print("{:<40} {:<10} {:<10}".format(
                    shingle,
                    incidence_matrix[idx, file1_idx],
                    incidence_matrix[idx, file2_idx]
                ))

# Example usage with file paths (replace with actual file paths)
file_paths = ["C:/Users/Qureshi/OneDrive/Desktop/txt1.txt", 
              "C:/Users/Qureshi/OneDrive/Desktop/txt2.txt", 
              "C:/Users/Qureshi/OneDrive/Desktop/txt3.txt"]

# List of file pairs to compare: (File 1 and 2), (File 2 and 3), (File 1 and 3)
comparison_pairs = [(1, 2), (2, 3), (1, 3)]

compare_files(file_paths, comparison_pairs)



Comparison between File 1 and File 2:
Shingles                                 File 1     File 2    
----------------------------------------------------------------------
good morning all today                   1          1         
morning all today is                     1          1         
all today is saturday                    1          0         
today is saturday                        1          0         
all today is sunday                      0          1         
today is sunday                          0          1         

Comparison between File 2 and File 3:
Shingles                                 File 2     File 3    
----------------------------------------------------------------------
good morning all today                   1          0         
morning all today is                     1          0         
all today is sunday                      1          0         
today is sunday                          1          0         
good morning all we     