<a href="https://colab.research.google.com/github/Vijayalakshmii04/ATM-Software/blob/main/adct_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch numpy




In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import heapq
from collections import Counter


In [None]:
class TextTokenizer:
    """Character-level tokenizer with four reserved special tokens.

    Ids 0-3 are reserved for ``<PAD>``, ``<UNK>``, ``<START>`` and ``<END>``;
    every distinct character seen by :meth:`fit` gets the next free id, in
    sorted character order.
    """

    def __init__(self):
        self.special = {'<PAD>':0, '<UNK>':1, '<START>':2, '<END>':3}
        self.char_to_idx = dict(self.special)
        self.idx_to_char = {v:k for k,v in self.char_to_idx.items()}

    def fit(self, text):
        """Build the vocabulary from the distinct characters of ``text``.

        The vocabulary is rebuilt from scratch on every call. Without the
        reset, characters from a previous ``fit`` would keep their old ids and
        collide with the ids handed out to the new characters.
        """
        self.char_to_idx = dict(self.special)
        offset = len(self.special)
        for i, c in enumerate(sorted(set(text))):
            self.char_to_idx[c] = i + offset

        self.idx_to_char = {v:k for k,v in self.char_to_idx.items()}

    def encode(self, text, add_special=True):
        """Map each character of ``text`` to its id.

        Characters not in the vocabulary map to the ``<UNK>`` id. When
        ``add_special`` is true the result is wrapped in ``<START>`` ... ``<END>``.
        """
        unk_id = self.special['<UNK>']
        ids = [self.char_to_idx.get(c, unk_id) for c in text]
        if add_special:
            return [self.special['<START>']] + ids + [self.special['<END>']]
        return ids

    def decode(self, ids, skip_special=False):
        """Join the symbols for ``ids`` into a string.

        Unknown ids are dropped. Special tokens are rendered by name (e.g.
        ``<START>``) unless ``skip_special`` is true, in which case they are
        omitted so that ``decode(encode(s), skip_special=True) == s`` for text
        made only of fitted characters.
        """
        special_ids = set(self.special.values()) if skip_special else ()
        return ''.join(self.idx_to_char.get(i, "")
                       for i in ids if i not in special_ids)


In [None]:
class LSTMPredictor(nn.Module):
    """Next-symbol model: embedding -> single-layer LSTM -> linear -> softmax.

    ``forward`` returns probabilities (already softmax-normalised over the
    last dimension), not raw logits.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super().__init__()
        # Sub-modules are created in the same order as before so that weight
        # initialisation consumes the RNG identically.
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim,
                            hidden_size=hidden_dim,
                            batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim,
                            out_features=vocab_size)

    def forward(self, x):
        """Return a probability distribution over the vocabulary per position.

        ``x`` is a tensor of token ids; the callers in this notebook pass a
        ``(1, seq_len)`` long tensor.
        """
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        return F.softmax(self.fc(hidden_states), dim=-1)


In [None]:
def train_model(model, data, epochs=20):
    """Fit ``model`` to predict each token of ``data`` from the tokens before it.

    Args:
        model: module whose forward pass takes a ``(1, seq_len)`` long tensor
            and returns *probabilities* over the vocabulary for every position
            (as ``LSTMPredictor`` in this notebook does).
        data: sequence of integer token ids; must hold at least two ids so
            there is one (input, target) pair.
        epochs: number of full-sequence Adam steps (learning rate 0.01).

    Raises:
        ValueError: if ``data`` has fewer than two ids.

    Prints the loss on every fifth epoch. The loss is the mean negative
    log-likelihood of the true next token, in nats.
    """
    if len(data) < 2:
        raise ValueError("data must contain at least two token ids")

    # Inputs and targets are the same for every epoch, so build them once.
    x = torch.tensor([data[:-1]], dtype=torch.long)
    y = torch.tensor([data[1:]], dtype=torch.long)

    opt = torch.optim.Adam(model.parameters(), lr=0.01)
    # The model already outputs softmax probabilities. CrossEntropyLoss applies
    # log-softmax itself, so feeding it probabilities normalises twice and
    # reports a loss that is not the real likelihood. NLLLoss on log(probs)
    # gives the proper cross-entropy.
    loss_fn = nn.NLLLoss()
    eps = 1e-12  # floor to keep log() finite if a probability underflows to 0

    for ep in range(epochs):
        opt.zero_grad()
        probs = model(x)
        log_probs = torch.log(probs.clamp_min(eps))
        loss = loss_fn(log_probs.reshape(-1, log_probs.size(-1)),
                       y.reshape(-1))
        loss.backward()
        opt.step()

        if ep % 5 == 0:
            print("Epoch", ep, "Loss:", loss.item())


In [None]:
def arithmetic_encode(text_ids, prob_seq):
    """Arithmetic-code ``text_ids`` into a single number in [0, 1).

    Args:
        text_ids: symbol ids to encode, in order.
        prob_seq: ``prob_seq[i]`` is the probability distribution (indexable by
            symbol id) used to encode ``text_ids[i]``.

    Returns:
        The midpoint of the final interval, as a Python ``float``.

    Raises:
        ValueError: if a symbol id is not a valid index into its distribution.
            Previously such a symbol was skipped silently, so the tag did not
            encode it.

    Note:
        The interval is held in double-precision floats, so it loses
        resolution on long or low-probability sequences.
    """
    low, high = 0.0, 1.0

    for i, true_id in enumerate(text_ids):
        probs = prob_seq[i]
        if not 0 <= true_id < len(probs):
            raise ValueError(
                f"symbol id {true_id} at position {i} is outside the "
                f"{len(probs)}-entry distribution"
            )

        # Coerce to Python float: NumPy float32 scalars would otherwise pull
        # the whole interval computation down to float32 under NumPy 2
        # promotion rules.
        cum = 0.0
        for pr in probs[:true_id]:
            cum += float(pr)
        pr_true = float(probs[true_id])

        # Narrow [low, high) to the sub-interval owned by the true symbol.
        width = high - low
        high = low + width * (cum + pr_true)
        low = low + width * cum

    return (low + high) / 2


In [None]:
class Node:
    """Huffman tree node.

    Leaves carry a symbol in ``char``; merged nodes are created with
    ``char=None`` and hold their two children in ``left`` / ``right``.
    ``freq`` is the node's weight.
    """

    def __init__(self, char=None, freq=None, left=None, right=None):
        self.char, self.freq = char, freq
        self.left, self.right = left, right

    def __lt__(self, other):
        # Ordering by weight only; this is what lets heapq rank nodes.
        mine, theirs = self.freq, other.freq
        return mine < theirs

def build_huffman(text):
    """Build a Huffman tree over the symbols of ``text`` and return its root.

    Each distinct symbol becomes a leaf weighted by its occurrence count; the
    two lightest nodes are merged repeatedly until one node remains. An empty
    ``text`` leaves the heap empty, so the final lookup raises ``IndexError``.
    """
    pending = []
    for symbol, count in Counter(text).items():
        pending.append(Node(symbol, count))
    heapq.heapify(pending)

    while len(pending) >= 2:
        # First pop becomes the left child, second pop the right child.
        first = heapq.heappop(pending)
        second = heapq.heappop(pending)
        parent = Node(None, first.freq + second.freq, first, second)
        heapq.heappush(pending, parent)

    return pending[0]

def generate_codes(node, prefix="", codes=None):
    """Walk a Huffman tree and return a ``{symbol: bit-string}`` mapping.

    Args:
        node: tree node exposing ``char``, ``left`` and ``right``; a node whose
            ``char`` is not ``None`` is treated as a leaf.
        prefix: bits accumulated on the path from the root to ``node``.
        codes: dict to fill in. A fresh dict is created when omitted, so
            separate calls no longer share (and pollute) one default dict.

    Returns:
        The ``codes`` dict. A tree made of a single leaf gets the code ``"0"``.
    """
    if codes is None:
        codes = {}

    # Test against None rather than truthiness so falsy symbols (e.g. the
    # integer 0) are still recognised as leaves.
    if node.char is not None:
        codes[node.char] = prefix or "0"
    else:
        generate_codes(node.left, prefix + "0", codes)
        generate_codes(node.right, prefix + "1", codes)
    return codes


In [None]:
def compress_text(input_text):
    """Run the full demo pipeline on ``input_text`` and print every stage.

    Steps: fit a character tokenizer, train a fresh ``LSTMPredictor`` on the
    encoded text for 15 epochs, print the model's top-3 next-character
    probabilities at every position, arithmetic-encode the text with those
    probabilities, and build Huffman codes from the raw character counts.

    Args:
        input_text: non-empty string to compress.

    Returns:
        ``(tag, huff_codes)``: the arithmetic-coding tag and the
        ``{char: bit-string}`` Huffman table.

    Raises:
        ValueError: if ``input_text`` is empty. Raised up front rather than
            after training, where the Huffman step would fail on an empty heap.
    """
    if not input_text:
        raise ValueError("input_text must not be empty")

    tokenizer = TextTokenizer()
    tokenizer.fit(input_text)
    encoded = tokenizer.encode(input_text)

    print("Input Text:", input_text)
    print("Token IDs:", encoded)

    # ---- Train LSTM ----
    model = LSTMPredictor(len(tokenizer.char_to_idx))
    train_model(model, encoded, epochs=15)

    # ---- Get LSTM probabilities ----
    # Inference only: skip autograd bookkeeping.
    x = torch.tensor([encoded[:-1]], dtype=torch.long)
    with torch.no_grad():
        probs = model(x)[0].numpy()   # shape = (len-1, vocab_size)

    prob_seq = []  # one list of Python floats per position

    print("\n===== LSTM TOP-3 PROBABILITIES =====")
    for i, p in enumerate(probs):

        # tolist() yields Python floats, so the arithmetic coder works in
        # double precision instead of inheriting float32 scalars.
        dist = p.tolist()
        prob_seq.append(dist)

        # print readable top-3
        char_probs = [(tokenizer.idx_to_char[idx], pr)
                      for idx, pr in enumerate(dist)]
        top3 = sorted(char_probs, key=lambda x: x[1], reverse=True)[:3]

        print(f"\nAt position {i} after '{tokenizer.idx_to_char[encoded[i]]}':")
        for char, pr in top3:
            print(f"   {char}: {pr:.4f}")

    # ---- Arithmetic Encoding ----
    # Row i of prob_seq predicts encoded[i + 1], hence the one-step shift.
    print("\n===== ARITHMETIC ENCODED TAG =====")
    tag = arithmetic_encode(encoded[1:], prob_seq)
    print("Tag:", tag)

    # ---- Huffman Codes ----
    print("\n===== HUFFMAN CODES =====")
    huff_tree = build_huffman(input_text)
    # Pass a fresh dict explicitly so codes from an earlier call cannot leak
    # into this table.
    huff_codes = generate_codes(huff_tree, "", {})
    print(huff_codes)

    return tag, huff_codes



In [None]:
# Seed PyTorch so weight initialisation, and therefore the printed losses,
# probabilities and tag, are the same on every Restart & Run All.
torch.manual_seed(42)

sample_text = "Vellore Institute of Technology"
tag, huffman = compress_text(sample_text)




Input Text: Vellore Institute of Technology
Token IDs: [2, 7, 9, 14, 14, 16, 17, 9, 4, 5, 15, 18, 19, 13, 19, 20, 19, 9, 4, 16, 10, 4, 6, 9, 8, 12, 15, 16, 14, 16, 11, 21, 3]
Epoch 0 Loss: 3.090259552001953
Epoch 5 Loss: 2.739793062210083
Epoch 10 Loss: 2.3846354484558105

===== LSTM TOP-3 PROBABILITIES =====

At position 0 after '<START>':
   V: 0.6098
   e: 0.2554
   l: 0.0249

At position 1 after 'V':
   e: 0.9966
   l: 0.0014
   V: 0.0013

At position 2 after 'e':
   l: 0.9939
   c: 0.0029
   e: 0.0011

At position 3 after 'l':
   l: 0.9664
   o: 0.0311
   e: 0.0015

At position 4 after 'l':
   o: 0.9904
   l: 0.0055
   r: 0.0031

At position 5 after 'o':
   r: 0.9744
   g: 0.0190
   e: 0.0019

At position 6 after 'r':
   e: 0.9945
    : 0.0024
   r: 0.0022

At position 7 after 'e':
    : 0.9891
   I: 0.0039
   e: 0.0029

At position 8 after ' ':
   I: 0.9890
   n: 0.0025
   o: 0.0020

At position 9 after 'I':
   n: 0.9803
   I: 0.0082
   s: 0.0068

At position 10 after 'n':
   s: 