In [None]:
import hashlib
from collections import Counter
from google.colab import files
from IPython.display import display, HTML

# Step 1: UPLOAD FILE
# files.upload() opens the browser file picker and returns a mapping of
# {filename: file contents as bytes} for everything the user selected.
uploaded = files.upload()
if not uploaded:
    # A cancelled picker yields an empty mapping; fail with a clear message
    # instead of an opaque StopIteration from next().
    raise RuntimeError("No file was uploaded; re-run this cell and choose a file.")

# Only the first uploaded file is processed.
filename = next(iter(uploaded))

# Take the bytes straight from the upload result. Colab may save the upload
# under a de-duplicated name (e.g. "name (1).txt"), so re-opening a path
# derived from the mapping key risks reading a different, older file.
raw_bytes = uploaded[filename]

# Reference values used later to verify the round trip.
orig_len = len(raw_bytes)
orig_sha256 = hashlib.sha256(raw_bytes).hexdigest()

# Shannon–Fano Helper Functions

# Frequency table
def build_frequency_table(data):
    """Count how often each element of ``data`` occurs.

    Args:
        data: Any iterable of hashable items (iterating ``bytes`` yields
            integer byte values).

    Returns:
        A ``collections.Counter`` mapping each distinct item to its count.
    """
    tally = Counter()
    tally.update(data)
    return tally

# Shannon–Fano Recursive Coding
def shannon_fano(symbol_freq):
    """Build a Shannon–Fano prefix-code table from symbol frequencies.

    Symbols are ordered by descending frequency and recursively divided
    into two groups whose total frequencies are as close as possible; the
    first group gets a "0" appended to its codes and the second a "1".

    Args:
        symbol_freq: Mapping of symbol -> frequency (e.g. a ``Counter``).

    Returns:
        Dict mapping every symbol to its code, a string of "0"/"1"
        characters. An empty mapping yields an empty dict. A mapping with
        exactly one symbol yields the one-bit code "0" for it, so that the
        encoded stream is never empty and remains decodable.
    """
    # Sort symbols by descending frequency (sorted() is stable, so ties keep
    # their original relative order and the table is deterministic).
    symbols = sorted(symbol_freq.items(), key=lambda x: x[1], reverse=True)
    codes = {s: "" for s, _ in symbols}

    # The recursion never assigns a bit to a lone symbol, which would leave
    # it with the empty code; give it an explicit one-bit code instead.
    if len(symbols) == 1:
        codes[symbols[0][0]] = "0"
        return codes

    def assign_code(symbols_list):
        """Append one bit to every code in ``symbols_list`` and recurse."""
        if len(symbols_list) <= 1:
            return

        total = sum(freq for _, freq in symbols_list)

        # Find the cut that minimises |left_total - right_total|.
        # Only positions 0..len-2 are candidates so both halves are
        # non-empty; on ties the earliest cut wins.
        running = 0
        split_index = 0
        best_diff = None
        for i, (_, freq) in enumerate(symbols_list[:-1]):
            running += freq
            diff = abs(total - 2 * running)  # == |left - right|
            if best_diff is None or diff < best_diff:
                best_diff = diff
                split_index = i

        left = symbols_list[:split_index + 1]
        right = symbols_list[split_index + 1:]

        # Extend the code of each symbol with the bit for its half
        for s, _ in left:
            codes[s] += "0"
        for s, _ in right:
            codes[s] += "1"

        assign_code(left)
        assign_code(right)

    assign_code(symbols)
    return codes

# Compress
def shannon_fano_compress(data):
    """Encode ``data`` with a Shannon–Fano code built from its own contents.

    Args:
        data: Iterable of symbols to encode (typically ``bytes``).

    Returns:
        A 3-tuple ``(code_table, bitstring, packed)`` where ``code_table``
        maps each symbol to its "0"/"1" code string, ``bitstring`` is the
        concatenation of the codes for every symbol in ``data``, and
        ``packed`` is that bitstring packed MSB-first into ``bytes``, the
        final byte being right-padded with "0" bits when needed.
    """
    code_table = shannon_fano(build_frequency_table(data))

    # One code per input symbol, concatenated in input order
    bitstring = "".join(code_table[symbol] for symbol in data)

    # Pack 8 bits at a time; ljust only affects a short trailing chunk
    packed = bytes(
        int(bitstring[start:start + 8].ljust(8, "0"), 2)
        for start in range(0, len(bitstring), 8)
    )

    return code_table, bitstring, packed

# Decompress
def shannon_fano_decompress(encoded_bytes, code_table, original_length):
    """Decode a packed Shannon–Fano bitstream back into bytes.

    Args:
        encoded_bytes: Packed bitstream (MSB-first within each byte), possibly
            ending in padding bits.
        code_table: Mapping of symbol (integer byte value) -> "0"/"1" code
            string, as used for encoding.
        original_length: Number of symbols to recover. Decoding stops once
            this many have been produced, so trailing padding bits are never
            interpreted as data.

    Returns:
        The decoded ``bytes``. If the stream runs out before
        ``original_length`` symbols are decoded, the symbols decoded so far
        are returned.
    """
    # Nothing to recover: do not let padding bits decode into symbols.
    if original_length <= 0:
        return b""

    # Build reverse lookup (code string -> symbol)
    reverse = {v: k for k, v in code_table.items()}

    # Degenerate table: a single symbol carrying the empty code occupies no
    # bits in the stream, so the output is fully determined by the length.
    if len(reverse) == 1 and "" in reverse:
        return bytes([reverse[""]]) * original_length

    # Convert bytes to bitstring
    bitstring = "".join(f"{byte:08b}" for byte in encoded_bytes)

    output = []
    buffer = ""

    # Codes are prefix-free, so the first match of the accumulated bits is
    # always a complete symbol.
    for bit in bitstring:
        buffer += bit
        if buffer in reverse:
            output.append(reverse[buffer])
            buffer = ""
            if len(output) == original_length:
                break

    return bytes(output)

# Run compression: encode the uploaded bytes and keep the code table, which
# is required again for decoding.

code_table, bitstring, compressed_bytes = shannon_fano_compress(raw_bytes)

# Persist the packed bitstream
with open("sf_compressed.bin", mode="wb") as f:
    f.write(compressed_bytes)

# Round-trip: decode with the same table and the original symbol count
restored_bytes = shannon_fano_decompress(
    encoded_bytes=compressed_bytes,
    code_table=code_table,
    original_length=orig_len,
)

# Persist the decoded result
with open("sf_restored.bin", mode="wb") as f:
    f.write(restored_bytes)

# Save text versions (Latin-1)
def bin_to_txt(src, dst):
    """Copy the binary file ``src`` to ``dst`` as Latin-1 text.

    Latin-1 maps every byte value 0-255 to exactly one code point, so the
    decode/encode pair is lossless and ``dst`` ends up byte-identical to
    ``src``.

    Args:
        src: Path of the binary file to read.
        dst: Path of the text file to create or overwrite.
    """
    # newline="" disables newline translation on write; without it a "\n"
    # byte would be rewritten as the platform line separator (e.g. "\r\n"
    # on Windows), altering the payload.
    with open(src, "rb") as f_in, \
            open(dst, "w", encoding="latin-1", newline="") as f_out:
        f_out.write(f_in.read().decode("latin-1"))

# Produce the Latin-1 text copies offered by the download buttons
bin_to_txt(src="sf_compressed.bin", dst="sf_compressed.txt")
bin_to_txt(src="sf_restored.bin", dst="sf_restored.txt")

# Stats
comp_size = len(compressed_bytes)
if orig_len:
    # Compressed size as a fraction of the original size
    ratio = comp_size / orig_len
else:
    # Empty input: avoid dividing by zero
    ratio = 0
# Round-trip verification, by direct comparison and by digest
match = raw_bytes == restored_bytes
rest_sha256 = hashlib.sha256(restored_bytes).hexdigest()

# Display UI: an HTML summary card showing the original and compressed
# sizes, the ratio, the round-trip match flag and both SHA-256 digests,
# followed by two download buttons. Each button calls
# google.colab.kernel.invokeFunction with a callback name
# ('download_sf_comp' / 'download_sf_rest') that must be registered on the
# Python side. The doubled braces "{{}}" are f-string escapes that emit a
# literal "{}" (the empty kwargs object passed to invokeFunction).

html = f"""
<div style="
    border: 2px solid #8e44ad;
    border-radius: 12px;
    padding: 30px;
    width: 70%;
    margin: 30px auto;
    background: #f8f9fa;
    text-align: center;
    font-family: 'Segoe UI', Tahoma, sans-serif;
">
  <h1 style="color:#8e44ad; font-size:28px; margin-bottom:10px;">
    Shannon–Fano Compression (Existing System)
  </h1>

  <p style="font-size:20px;"><b>Original size:</b> {orig_len} bytes</p>
  <p style="font-size:20px;"><b>Compressed size:</b> {comp_size} bytes</p>
  <p style="font-size:20px;"><b>Compression ratio:</b> {ratio:.3f}×</p>
  <p style="font-size:20px;"><b>Match with original:</b> {match}</p>

  <div style="margin: 15px 0;">
    <p><b>SHA256 (Original):</b> {orig_sha256}</p>
    <p><b>SHA256 (Restored):</b> {rest_sha256}</p>
  </div>

  <button onclick="google.colab.kernel.invokeFunction('download_sf_comp', [], {{}})"
          style="background:#8e44ad;color:white;padding:12px 24px;border:none;
          border-radius:8px;cursor:pointer;margin-right:12px;font-size:16px;">
    Download sf_compressed.txt
  </button>

  <button onclick="google.colab.kernel.invokeFunction('download_sf_rest', [], {{}})"
          style="background:#2ecc71;color:white;padding:12px 24px;border:none;
          border-radius:8px;cursor:pointer;font-size:16px;">
    Download sf_restored.txt
  </button>
</div>
"""

from google.colab import output
display(HTML(html))


# Kernel-side handlers for the two buttons rendered above
def download_sf_comp():
    """Send the Latin-1 text copy of the compressed data to the browser."""
    files.download("sf_compressed.txt")


def download_sf_rest():
    """Send the Latin-1 text copy of the restored data to the browser."""
    files.download("sf_restored.txt")


# Names here must match the ones used in the buttons' onclick handlers
_DOWNLOAD_CALLBACKS = {
    'download_sf_comp': download_sf_comp,
    'download_sf_rest': download_sf_rest,
}
for callback_name, callback in _DOWNLOAD_CALLBACKS.items():
    output.register_callback(callback_name, callback)


Saving sample_text.txt to sample_text (1).txt
