In [None]:
!pip install python-docx PyPDF2

In [22]:
import heapq
import os
import collections
from docx import Document  # For reading DOCX files
import PyPDF2  # For reading PDF files

# Node class for Huffman Tree
class Node:
    def __init__(self, char, freq):
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None

    def __lt__(self, other):
        return self.freq < other.freq

# Function to build Huffman Tree
def build_huffman_tree(frequency):
    heap = [Node(char, freq) for char, freq in frequency.items()]
    heapq.heapify(heap)
    
    while len(heap) > 1:
        left = heapq.heappop(heap)
        right = heapq.heappop(heap)
        merged = Node(None, left.freq + right.freq)
        merged.left = left
        merged.right = right
        heapq.heappush(heap, merged)
        
    return heap[0]

# Function to generate Huffman Codes
def generate_huffman_codes(root):
    codes = {}
    def _generate_codes(node, current_code):
        if node is None:
            return
        if node.char is not None:
            codes[node.char] = current_code
        _generate_codes(node.left, current_code + "0")
        _generate_codes(node.right, current_code + "1")
        
    _generate_codes(root, "")
    return codes

# Function to compress content using Huffman Coding
def compress(content):
    frequency = collections.Counter(content)
    huffman_tree_root = build_huffman_tree(frequency)
    huffman_codes = generate_huffman_codes(huffman_tree_root)
    
    # Encode the content with Huffman codes
    encoded_content = "".join(huffman_codes[char] for char in content)
    
    # Calculate the size of the original and compressed data
    original_size = len(content) * 8  # 1 character = 8 bits
    compressed_size = len(encoded_content)  # Compressed size in bits
    
    return encoded_content, huffman_codes, original_size, compressed_size

# Function to calculate compression ratio
def calculate_compression_ratio(original_size, compressed_size):
    return compressed_size / original_size

# Function to read TXT, HTML, DOCX, and PDF files
def read_file(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    
    if ext in ['.txt', '.html']:  # Read as text files
        encodings = ['utf-8', 'ISO-8859-1', 'windows-1252']
        for enc in encodings:
            try:
                with open(file_path, 'r', encoding=enc) as file:
                    return file.read()
            except UnicodeDecodeError:
                continue
        raise UnicodeDecodeError(f"Unable to decode file {file_path} with the available encodings.")
    
    elif ext == '.docx':  # Read DOCX files using python-docx
        doc = Document(file_path)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        return '\n'.join(full_text)
    
    elif ext == '.pdf':  # Read PDF files using PyPDF2
        pdf_text = ""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                pdf_text += page.extract_text()
        return pdf_text
    
    else:
        raise ValueError(f"Unsupported file format: {ext}")

# Main function to compress file and compute compression ratio
def huffman_compress_file(file_path):
    try:
        content = read_file(file_path)
        compressed_data, codes, original_size, compressed_size = compress(content)
        compression_ratio = calculate_compression_ratio(original_size, compressed_size)

        # Save compressed data as binary string to a file
        compressed_file_path = file_path + '.huffman'
        with open(compressed_file_path, 'w') as f:
            f.write(compressed_data)

        print(f"Original size: {original_size} bits")
        print(f"Compressed size: {compressed_size} bits")
        print(f"Compression ratio: {compression_ratio:.2f}")

        return compressed_file_path, compression_ratio

    except FileNotFoundError as fnf_error:
        print(f"Error: {fnf_error}")
    except ValueError as ve:
        print(f"Error: {ve}")
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
# Example usage
file_path = 'C:/Users/Vithika SURVE/Downloads/Prayer_programme_5c final one copy (2).html'  # Replace with your actual file path
compressed_file, ratio = huffman_compress_file(file_path)


Original size: 594600 bits
Compressed size: 394279 bits
Compression ratio: 0.66


In [18]:
import heapq
from collections import defaultdict
import PyPDF2
import docx
from bs4 import BeautifulSoup

# Node class for Huffman tree
class Node:
    def _init_(self, char, freq):
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None

    def _lt_(self, other):
        return self.freq < other.freq

# Huffman coding function
def huffman_coding(data):
    # Check for empty data
    if not data:
        raise ValueError("Error: No data provided for Huffman coding.")

    # Calculate frequency of each character
    freq = defaultdict(int)
    for char in data:
        freq[char] += 1

    # Build a priority queue
    priority_queue = [Node(char, freq[char]) for char in freq]
    heapq.heapify(priority_queue)

    # Build the Huffman Tree
    while len(priority_queue) > 1:
        left = heapq.heappop(priority_queue)
        right = heapq.heappop(priority_queue)
        merged = Node(None, left.freq + right.freq)
        merged.left = left
        merged.right = right
        heapq.heappush(priority_queue, merged)

    # Generate Huffman codes
    codes = {}
    def generate_codes(node, current_code):
        if node:
            if node.char is not None:
                codes[node.char] = current_code
            generate_codes(node.left, current_code + "0")
            generate_codes(node.right, current_code + "1")

    generate_codes(priority_queue[0], "")
    return codes

# Function to read PDF files
def read_pdf(file_path):
    text = ""
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                extracted_text = page.extract_text()
                if extracted_text:
                    text += extracted_text + "\n"
        if not text.strip():
            raise ValueError("Error: The PDF file is empty or contains no text.")
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: File not found: {file_path}")
    except PyPDF2.PdfReadError:
        raise ValueError(f"Error: Unable to read PDF file: {file_path}")
    return text

# Function to read DOC/DOCX files
def read_doc(file_path):
    text = ""
    try:
        doc = docx.Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
        if not text.strip():
            raise ValueError("Error: The DOC file is empty or contains no text.")
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: File not found: {file_path}")
    except docx.opc.exceptions.PackageNotFoundError:
        raise ValueError(f"Error: The DOC file is corrupted or unreadable: {file_path}")
    return text

# Function to read TXT files
def read_txt(file_path):
    text = ""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        if not text.strip():
            raise ValueError("Error: The TXT file is empty.")
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: File not found: {file_path}")
    except UnicodeDecodeError:
        raise ValueError(f"Error: Unable to decode TXT file due to invalid encoding: {file_path}")
    return text

# Function to read HTML files
def read_html(file_path):
    text = ""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, 'html.parser')
            text = soup.get_text()
        if not text.strip():
            raise ValueError("Error: The HTML file is empty or contains no text.")
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: File not found: {file_path}")
    return text

# Function to read file based on its extension
def read_file(file_path):
    if not file_path:
        raise ValueError("Error: File path is empty.")
    if file_path.endswith('.pdf'):
        return read_pdf(file_path)
    elif file_path.endswith('.doc') or file_path.endswith('.docx'):
        return read_doc(file_path)
    elif file_path.endswith('.txt'):
        return read_txt(file_path)
    elif file_path.endswith('.html'):
        return read_html(file_path)
    else:
        raise ValueError(f"Error: Unsupported file format: {file_path}")

# Example usage
file_path='C:/Users/Vithika SURVE/OneDrive/Desktop/PYTHON PROJECTS/VITHIKA PYTHON/Prayer_programme_5c final one.pdf'  # Change to your file path

try:
    file_text = read_file(file_path)

    if file_text:  # Proceed only if text was extracted
        print("Extracted text:", file_text[:100])  # Print first 100 characters

        huffman_codes = huffman_coding(file_text)
        print("Huffman Codes:", huffman_codes)

        # Calculate original and compressed sizes
        original_size = len(file_text) * 8  # in bits
        compressed_size = sum(len(code) * file_text.count(char) for char, code in huffman_codes.items())

        print(f"Original Size (in bits): {original_size}")
        print(f"Compressed Size (in bits): {compressed_size}")

        # Avoid division by zero
        if compressed_size > 0:
            compression_ratio = original_size / compressed_size
        else:
            compression_ratio = 0

        print(f"Compression Ratio: {compression_ratio:.2f}")

except (FileNotFoundError, ValueError) as e:
    print(f"Terminating program: {e}")
except Exception as e:
    print(f"Terminating program due to an unexpected error: {e}")

Extracted text: Prayer programme  –(Paras  to be excluded as he wont climb)  
Please ask Prisha  if she is permitted
Terminating program due to an unexpected error: Node() takes no arguments


In [5]:
import heapq
from bs4 import BeautifulSoup
import docx
import PyPDF2

def huffman_encoding(text):
    """Encodes a text string using Huffman coding.

    Args:
        text: The input text string.

    Returns:
        A tuple containing the encoded text, the Huffman tree, and the compression ratio.
    """

    # Count symbol frequencies
    frequencies = {}
    for char in text:
        frequencies[char] = frequencies.get(char, 0) + 1

    # Create a priority queue of symbols and their frequencies
    heap = [(freq, char) for char, freq in frequencies.items()]
    heapq.heapify(heap)

    # Build the Huffman tree
    while len(heap) > 1:
        freq1, char1 = heapq.heappop(heap)
        freq2, char2 = heapq.heappop(heap)
        heapq.heappush(heap, (freq1 + freq2, (char1, char2)))

    # Assign codewords to symbols
    codewords = {}
    def assign_codewords(node, code):
        if isinstance(node, str):
            codewords[node] = code
        else:
            assign_codewords(node[0], code + '0')
            assign_codewords(node[1], code + '1')

    assign_codewords(heap[0][1], '')

    # Encode the text
    encoded_text = ''.join(codewords[char] for char in text)

    # Calculate compression ratio
    original_size = len(text) * 8  # Assuming 8 bits per character
    encoded_size = sum(len(codewords[char]) for char in text)
    compression_ratio = encoded_size / original_size

    return encoded_text, heap[0][1], compression_ratio

def huffman_decoding(encoded_text, huffman_tree):
    """Decodes an encoded text string using a Huffman tree.

    Args:
        encoded_text: The encoded text string.
        huffman_tree: The Huffman tree used for encoding.

    Returns:
        The decoded text string.
    """

    decoded_text = ''
    current_node = huffman_tree
    for bit in encoded_text:
        if bit == '0':
            current_node = current_node[0]
        else:
            current_node = current_node[1]
        if isinstance(current_node, str):
            decoded_text += current_node
            current_node = huffman_tree
    return decoded_text

# Example usage with HTML, DOC, and PDF
def extract_text_from_html(html_file):
    with open(html_file, 'r') as f:
        soup = BeautifulSoup(f, 'html.parser')
        text = soup.get_text()
    return text

def extract_text_from_doc(doc_file):
    doc = docx.Document(doc_file)
    text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
    return text

def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text = ''
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
    return text

# Example usage with different file formats
file_path = "C:/Users/Vithika SURVE/Downloads/Prayer_programme_5c final one copy (2).html"  # Replace with the actual file path
if file_path.endswith(".html"):
    text = extract_text_from_html(file_path)
elif file_path.endswith(".doc"):
    text = extract_text_from_doc(file_path)
elif file_path.endswith(".pdf"):
    text = extract_text_from_pdf(file_path)
else:
    print("Unsupported file format.")

encoded_text, huffman_tree, compression_ratio = huffman_encoding(text)
decoded_text = huffman_decoding(encoded_text, huffman_tree)

print("Original text:", text)
print("Encoded text:", encoded_text)
print("Decoded text:", decoded_text)
print("Compression ratio:", compression_ratio)

TypeError: '<' not supported between instances of 'tuple' and 'str'

In [2]:
!pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6


In [34]:
import heapq
from collections import defaultdict
import PyPDF2
import docx
from bs4 import BeautifulSoup

# Node class for Huffman tree
class Node:
    def __init__(self, char, freq):  # Corrected constructor with double underscores
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None

    def __lt__(self, other):  # Corrected comparison method
        return self.freq < other.freq

# Huffman coding function
def huffman_coding(data):
    # Check for empty data
    if not data:
        raise ValueError("Error: No data provided for Huffman coding.")

    # Calculate frequency of each character
    freq = defaultdict(int)
    for char in data:
        freq[char] += 1

    # Build a priority queue
    priority_queue = [Node(char, freq[char]) for char in freq]
    heapq.heapify(priority_queue)

    # Build the Huffman Tree
    while len(priority_queue) > 1:
        left = heapq.heappop(priority_queue)
        right = heapq.heappop(priority_queue)
        merged = Node(None, left.freq + right.freq)
        merged.left = left
        merged.right = right
        heapq.heappush(priority_queue, merged)

    # Generate Huffman codes
    codes = {}
    def generate_codes(node, current_code):
        if node:
            if node.char is not None:
                codes[node.char] = current_code
            generate_codes(node.left, current_code + "0")
            generate_codes(node.right, current_code + "1")

    generate_codes(priority_queue[0], "")
    return codes

# Function to read PDF files
def read_pdf(file_path):
    text = ""
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                extracted_text = page.extract_text()
                if extracted_text:
                    text += extracted_text + "\n"
        if not text.strip():
            raise ValueError("Error: The PDF file is empty or contains no text.")
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: File not found: {file_path}")
    except PyPDF2.PdfReadError:
        raise ValueError(f"Error: Unable to read PDF file: {file_path}")
    return text

# Function to read DOC/DOCX files
def read_doc(file_path):
    text = ""
    try:
        doc = docx.Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
        if not text.strip():
            raise ValueError("Error: The DOC file is empty or contains no text.")
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: File not found: {file_path}")
    except docx.opc.exceptions.PackageNotFoundError:
        raise ValueError(f"Error: The DOC file is corrupted or unreadable: {file_path}")
    return text

# Function to read TXT files
def read_txt(file_path):
    text = ""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        if not text.strip():
            raise ValueError("Error: The TXT file is empty.")
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: File not found: {file_path}")
    except UnicodeDecodeError:
        raise ValueError(f"Error: Unable to decode TXT file due to invalid encoding: {file_path}")
    return text

# Function to read HTML files
def read_html(file_path):
    text = ""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, 'html.parser')
            text = soup.get_text()
        if not text.strip():
            raise ValueError("Error: The HTML file is empty or contains no text.")
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: File not found: {file_path}")
    return text

# Function to read file based on its extension
def read_file(file_path):
    if not file_path:
        raise ValueError("Error: File path is empty.")
    if file_path.endswith('.pdf'):
        return read_pdf(file_path)
    elif file_path.endswith('.doc') or file_path.endswith('.docx'):
        return read_doc(file_path)
    elif file_path.endswith('.txt'):
        return read_txt(file_path)
    elif file_path.endswith('.html'):
        return read_html(file_path)
    else:
        raise ValueError(f"Error: Unsupported file format: {file_path}")

# Example usage
file_path = 'C:/Users/Vithika SURVE/OneDrive/Desktop/PYTHON PROJECTS/VITHIKA PYTHON/Prayer_programme_5c final one.dox'  # Change to your file path

try:
    file_text = read_file(file_path)

    if file_text:  # Proceed only if text was extracted
        print("Extracted text:", file_text[:100])  # Print first 100 characters

        huffman_codes = huffman_coding(file_text)
        print("Huffman Codes:", huffman_codes)

        # Calculate original and compressed sizes
        original_size = len(file_text) * 8  # in bits
        compressed_size = sum(len(code) * file_text.count(char) for char, code in huffman_codes.items())

        print(f"Original Size (in bits): {original_size}")
        print(f"Compressed Size (in bits): {compressed_size}")

        # Avoid division by zero
        if compressed_size > 0:
            compression_ratio = original_size / compressed_size
        else:
            compression_ratio = 0

        print(f"Compression Ratio: {compression_ratio:.2f}")

except (FileNotFoundError, ValueError) as e:
    print(f"Terminating program: {e}")
except Exception as e:
    print(f"Terminating program due to an unexpected error: {e}")


Terminating program: Error: Unsupported file format: C:/Users/Vithika SURVE/OneDrive/Desktop/PYTHON PROJECTS/VITHIKA PYTHON/Prayer_programme_5c final one.dox
