In [1]:
pip install python-magic

Collecting python-magic
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Installing collected packages: python-magic
Successfully installed python-magic-0.4.27
Note: you may need to restart the kernel to use updated packages.


In [None]:
import hashlib
import os
import mimetypes
from difflib import SequenceMatcher
from pathlib import Path
from typing import Tuple, Dict, Optional
import time

class FileComparator:
    """Compare two files by content hash, size, guessed MIME type and text similarity.

    The public entry point is ``compare_files``; the other methods are the
    individual steps it combines and can also be called on their own.
    """

    def __init__(self):
        # Maps the user-facing algorithm name to the hashlib constructor
        # that calculate_hash instantiates.
        self.supported_hash_algorithms = {
            'sha1': hashlib.sha1,
            'sha256': hashlib.sha256,
            'sha512': hashlib.sha512,
            'md5': hashlib.md5  # included for compatibility, not recommended for security
        }
        # Initialize mimetypes
        mimetypes.init()

    def get_file_info(self, filepath: str) -> Dict:
        """Collect basic metadata about a file.

        Args:
            filepath: Path of the file to inspect.

        Returns:
            A dict with the keys ``size`` (bytes), ``created`` and ``modified``
            (``time.ctime`` strings), ``mime_type`` (guessed from the file name,
            ``'application/octet-stream'`` when unknown), ``is_text`` and
            ``extension`` (lower-cased suffix, including the dot).

        Raises:
            OSError: If the file cannot be stat'ed or opened.
        """
        path = Path(filepath)
        file_stats = path.stat()

        # Guess file type using mimetypes (name-based; file content is not inspected)
        mime_type, _ = mimetypes.guess_type(filepath)
        if mime_type is None:
            mime_type = 'application/octet-stream'

        # Probe only the beginning of the file: it counts as text when a
        # 1024-character read decodes as UTF-8. Invalid bytes further into the
        # file are not detected here; compare_files copes with that case.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                f.read(1024)
        except UnicodeDecodeError:
            is_text = False
        else:
            is_text = True

        return {
            'size': file_stats.st_size,
            # st_ctime is the creation time on Windows but the last metadata
            # change on Unix; the key name is kept for existing callers.
            'created': time.ctime(file_stats.st_ctime),
            'modified': time.ctime(file_stats.st_mtime),
            'mime_type': mime_type,
            'is_text': is_text,
            'extension': path.suffix.lower()
        }

    def calculate_hash(self, filepath: str, algorithm: str = 'sha256', chunk_size: int = 8192) -> str:
        """Return the hex digest of a file's bytes.

        Args:
            filepath: Path of the file to hash.
            algorithm: One of the keys of ``supported_hash_algorithms``.
            chunk_size: Number of bytes read per iteration. Must not be 0.

        Returns:
            The hexadecimal digest string.

        Raises:
            ValueError: If ``algorithm`` is unsupported or ``chunk_size`` is 0.
            IOError: If the file cannot be read (the original error is chained).
        """
        if algorithm not in self.supported_hash_algorithms:
            raise ValueError(f"Unsupported hash algorithm. Use one of: {list(self.supported_hash_algorithms.keys())}")
        # read(0) returns b'' immediately, which would end the loop before any
        # data is hashed and silently yield the digest of empty input.
        if chunk_size == 0:
            raise ValueError("chunk_size must not be 0")

        hash_obj = self.supported_hash_algorithms[algorithm]()

        try:
            with open(filepath, "rb") as file:
                # iter() with a sentinel stops at the first empty read (EOF).
                for chunk in iter(lambda: file.read(chunk_size), b""):
                    hash_obj.update(chunk)
            return hash_obj.hexdigest()
        except IOError as e:
            raise IOError(f"Error reading file {filepath}: {str(e)}") from e

    def calculate_similarity(self, file1: str, file2: str) -> float:
        """Return the SequenceMatcher ratio of two files read as UTF-8 text.

        Both files are read completely into memory.

        Returns:
            A ratio in ``[0, 1]``, or ``-1`` when either file cannot be decoded
            as UTF-8 (i.e. is treated as binary).
        """
        try:
            with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
                text1 = f1.read()
                text2 = f2.read()
        except UnicodeDecodeError:
            return -1  # Return -1 for binary files

        # Identical text has a ratio of exactly 1.0; skip the expensive matcher.
        if text1 == text2:
            return 1.0
        return SequenceMatcher(None, text1, text2).ratio()

    def get_file_preview(self, filepath: str, preview_size: int = 1024) -> str:
        """Return the first ``preview_size`` characters of a UTF-8 text file.

        ``"..."`` is appended only when the file actually continues past the
        preview. A negative ``preview_size`` returns the whole content.

        Returns:
            The preview text, or ``"[Binary content]"`` when the file cannot be
            decoded as UTF-8.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                if preview_size < 0:
                    return f.read()
                # Read one character more than requested so that truncation can
                # be told apart from a file that is exactly preview_size long.
                content = f.read(preview_size + 1)
        except UnicodeDecodeError:
            return "[Binary content]"

        if len(content) > preview_size:
            return content[:preview_size] + "..."
        return content

    def compare_files(self, file1: str, file2: str, algorithm: str = 'sha256') -> Dict:
        """
        Comprehensive file comparison returning detailed results.

        Args:
            file1: Path of the first file.
            file2: Path of the second file.
            algorithm: Hash algorithm name passed to ``calculate_hash``.

        Returns:
            A dict with ``file1_info``, ``file2_info`` and ``hash_algorithm``.
            On success it also holds ``hash1``, ``hash2``, ``identical``,
            ``size_difference``, ``similarity_ratio`` (``None`` unless both
            files are fully decodable text) and ``file1_preview`` /
            ``file2_preview``. If hashing fails with an IOError, only an
            ``error`` key is added and the dict is returned early.

        Raises:
            ValueError: If ``algorithm`` is unsupported.
            OSError: If either file cannot be inspected by ``get_file_info``.
        """
        results = {
            'file1_info': self.get_file_info(file1),
            'file2_info': self.get_file_info(file2),
            'hash_algorithm': algorithm
        }

        # Calculate hashes
        try:
            hash1 = self.calculate_hash(file1, algorithm)
            hash2 = self.calculate_hash(file2, algorithm)
            results['hash1'] = hash1
            results['hash2'] = hash2
            results['identical'] = hash1 == hash2
        except IOError as e:
            results['error'] = str(e)
            return results

        # Calculate size difference
        results['size_difference'] = abs(results['file1_info']['size'] - results['file2_info']['size'])

        # Calculate similarity for text files
        similarity = None
        if results['file1_info']['is_text'] and results['file2_info']['is_text']:
            similarity = self.calculate_similarity(file1, file2)
            # The is_text probe only covers the start of each file, so the full
            # read can still fail; map the -1 sentinel to "no ratio available"
            # instead of leaking a negative ratio to callers.
            if similarity < 0:
                similarity = None

        results['similarity_ratio'] = similarity
        if similarity is None:
            results['file1_preview'] = "[Binary content]"
            results['file2_preview'] = "[Binary content]"
        else:
            # Add preview for text files
            results['file1_preview'] = self.get_file_preview(file1)
            results['file2_preview'] = self.get_file_preview(file2)

        return results

def print_comparison_report(results: Dict):
    """Print a formatted comparison report.

    Args:
        results: A result dict as produced by ``FileComparator.compare_files``.
            It must contain ``file1_info``, ``file2_info`` and
            ``hash_algorithm``. When it carries an ``error`` key (hashing
            failed), the error is printed and the hash/result sections, whose
            keys are absent in that case, are skipped.
    """
    print("\n=== File Comparison Report ===")

    # Both per-file sections share the same layout.
    for label, key in (("File 1", 'file1_info'), ("File 2", 'file2_info')):
        info = results[key]
        print(f"\n{label}: {info['mime_type']}")
        print(f"Size: {info['size']} bytes")
        print(f"Last modified: {info['modified']}")
        print(f"Extension: {info['extension']}")

    print(f"\nHash Algorithm: {results['hash_algorithm']}")

    # An early-returned error result has no hashes or comparison fields, so
    # indexing them below would raise KeyError.
    if 'error' in results:
        print(f"Error: {results['error']}")
        return

    print(f"File 1 Hash: {results['hash1']}")
    print(f"File 2 Hash: {results['hash2']}")

    print(f"\nResults:")
    print(f"Files are {'identical' if results['identical'] else 'different'}")
    print(f"Size difference: {results['size_difference']} bytes")

    # None means "not text"; a negative value is the binary sentinel returned
    # by calculate_similarity and must not be shown as a percentage.
    ratio = results.get('similarity_ratio')
    if ratio is not None and ratio >= 0:
        print(f"Text similarity: {ratio*100:.2f}%")
        print("\nFile 1 Preview:")
        print(results['file1_preview'])
        print("\nFile 2 Preview:")
        print(results['file2_preview'])

def main():
    """Interactively prompt for two file paths and a hash algorithm, then print the report.

    Any exception raised while comparing or reporting is caught and printed
    as ``Error: ...`` instead of propagating.
    """
    # Example usage
    comparator = FileComparator()

    # Get file paths from user; strip stray whitespace picked up when a path
    # is pasted into the prompt.
    file1 = input("Enter path to first file: ").strip()
    file2 = input("Enter path to second file: ").strip()

    # Get hash algorithm preference
    print("\nAvailable hash algorithms:", ", ".join(comparator.supported_hash_algorithms.keys()))
    # Strip before the emptiness check so whitespace-only input falls back to
    # the default and " sha256 " is accepted.
    algorithm = input("Enter hash algorithm (default: sha256): ").strip().lower() or 'sha256'

    try:
        results = comparator.compare_files(file1, file2, algorithm)
        print_comparison_report(results)
    except Exception as e:
        print(f"Error: {str(e)}")

# Start the interactive prompt when this code is executed as the main module.
# NOTE(review): inside a Jupyter/IPython kernel __name__ is "__main__" as well,
# so running this cell calls main() and waits on input() — confirm that an
# interactive prompt during "Run All" is intended.
if __name__ == "__main__":
    main()

In [None]:
# Non-interactive example: compare two files with the default hash algorithm
# (sha256) and print the report.
# The paths are relative, so they resolve against the kernel's working
# directory. NOTE(review): confirm file1.txt and file2.txt exist there;
# compare_files raises if either file cannot be stat'ed.
comparator = FileComparator()
results = comparator.compare_files("file1.txt", "file2.txt")
print_comparison_report(results)