In [1]:
import torch
import transformers
from transformers import AutoTokenizer, GPT2LMHeadModel


In [None]:
# Load the pretrained "gpt2" tokenizer and language-model weights.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# The notebook only runs inference, so the model is put in evaluation mode
# right away; the bare name on the last line is what the cell displays.
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
model

In [3]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
device

'cuda'

In [4]:
# Place the model on the device selected above; the encode/decode helpers
# build their input tensors on that same device.
model = model.to(device)

In [5]:
# Invert the tokenizer's token -> id table so an id can be mapped back to its
# vocabulary entry.
vocab = tokenizer.get_vocab()
index_to_token = dict(zip(vocab.values(), vocab.keys()))

In [6]:
def encode(text):
  """Turn text into a first token id followed by per-token prediction ranks.

  The text is tokenized and run through the model in a single forward pass.
  For every token after the first, the model's scores at the preceding
  position are ordered by (score, token id) descending, and the position of
  the actual token in that ordering is recorded.

  NOTE(review): the whole text goes through one forward pass, so inputs longer
  than the model's maximum context are presumably unsupported here -- confirm;
  window_encode looks like the intended path for long texts.

  Args:
    text: String to encode.

  Returns:
    A list of ints. Element 0 is the id of the first token; each later element
    is the rank (0 = highest score) of the corresponding token. An empty list
    is returned when the text yields no tokens.
  """
  inputs = tokenizer(text, return_tensors="pt", verbose=True)
  inputs = inputs.to(device)
  token_ids = inputs["input_ids"][0]
  if token_ids.numel() == 0:
    return []

  # Inference only: no autograd graph, and no `labels`, since the loss is unused.
  with torch.no_grad():
    logits = model(**inputs).logits[0]

  # Row i of the logits scores the token at position i + 1, so the final row
  # (which predicts past the end of the text) is dropped.
  context_logits = logits[:-1]
  targets = token_ids[1:].unsqueeze(1)
  target_scores = context_logits.gather(1, targets)
  vocab_ids = torch.arange(context_logits.shape[1], device=context_logits.device).unsqueeze(0)

  # The rank of a token is the number of vocabulary entries that sort strictly
  # before it under (score, id) descending: a higher score, or the same score
  # with a larger id. This equals its index in the fully sorted list.
  sorts_before = (context_logits > target_scores) | (
      (context_logits == target_scores) & (vocab_ids > targets))
  ranks = sorts_before.sum(dim=1).tolist()

  return [token_ids[0].item()] + ranks


In [7]:
def decode(indices):
  """Rebuild text pieces from a first token id followed by prediction ranks.

  Each rank is resolved by running the model on the tokens recovered so far and
  picking the entry at that position when the scores are ordered by
  (score, token id) descending.

  Args:
    indices: List of ints. Element 0 is a token id; every later element is a
      rank into the model's ordered predictions for that position.

  Returns:
    A list with one string per entry of `indices`; joining the list gives the
    decoded text. When a token ends in the middle of a multi-byte character,
    its entry is an empty string and the completed text is attached to the
    later token that finishes the character. Returns [] for empty input.

  Raises:
    IndexError: if a rank is not smaller than the vocabulary size.
  """
  if not indices:
    return []

  # The sequence is pre-filled with placeholder id 0 and kept at full length;
  # only positions that are already recovered are read back from the logits.
  token_ids = [indices[0]] + [0] * (len(indices) - 1)
  attention_mask = torch.tensor([[1] * len(indices)], device=device)
  last_position = len(indices) - 1

  output = []
  pending = []  # ids whose bytes do not yet decode to complete characters

  def emit(token_id, position):
    # Decode the buffered ids together so characters split across tokens are
    # reassembled. Cleanup is disabled so spacing is reproduced verbatim.
    pending.append(token_id)
    piece = tokenizer.decode(pending, clean_up_tokenization_spaces=False)
    if piece.endswith("\ufffd") and position != last_position:
      # Trailing replacement character: the bytes are presumably incomplete,
      # so hold the ids back until a later token completes them.
      output.append("")
    else:
      output.append(piece)
      pending.clear()

  emit(indices[0], 0)
  with torch.no_grad():
    for i, rank in enumerate(indices[1:]):
      ids = torch.tensor([token_ids], device=device)
      logits = model(input_ids=ids, attention_mask=attention_mask).logits
      scores = logits[0, i, :]

      # A stable descending sort of the reversed scores keeps the larger token
      # id first among equal scores, i.e. (score, id) descending order.
      order = torch.sort(scores.flip(0), descending=True, stable=True).indices
      word_index = scores.shape[0] - 1 - order[rank].item()

      emit(word_index, i + 1)
      token_ids[i + 1] = word_index
  return output


In [8]:
def single_encode(ids):
  """Compute prediction ranks for every token after the first in one window.

  Args:
    ids: Tensor of token ids for one window; it is reshaped to a single row.

  Returns:
    A list of ints, one per token after the first: the position of that token
    when the model's scores for it are ordered by (score, token id)
    descending (0 = highest score). Empty when the window has fewer than two
    tokens.
  """
  ids = torch.reshape(ids, (1, -1)).to(device)
  if ids.shape[1] < 2:
    return []

  # The mask must have the same (1, n) shape as the ids. Building it from
  # len(ids) after the reshape would always give a single element.
  attention_mask = torch.ones_like(ids)
  with torch.no_grad():
    logits = model(input_ids=ids, attention_mask=attention_mask).logits[0]

  # Row i of the logits scores the token at position i + 1; the last row has
  # no token to compare against and is dropped.
  context_logits = logits[:-1]
  targets = ids[0, 1:].unsqueeze(1)
  target_scores = context_logits.gather(1, targets)
  vocab_ids = torch.arange(context_logits.shape[1], device=context_logits.device).unsqueeze(0)

  # Count the entries that sort strictly before the actual token under
  # (score, id) descending; that count is the token's rank.
  sorts_before = (context_logits > target_scores) | (
      (context_logits == target_scores) & (vocab_ids > targets))
  return sorts_before.sum(dim=1).tolist()


In [9]:
def window_encode(text, window_length, overlap):
  """Encode text as prediction ranks using overlapping fixed-size windows.

  The token sequence is processed in windows of at most `window_length`
  tokens. Each window after the first starts `overlap` tokens before the end
  of the previous one, so every newly encoded token has at least `overlap`
  tokens of context.

  Args:
    text: String to encode.
    window_length: Maximum number of tokens passed to single_encode at once.
    overlap: Number of already-encoded tokens repeated at the start of each
      following window.

  Returns:
    A list of ints: element 0 is the id of the first token, every later
    element is the rank produced by single_encode for that token. Empty when
    the text yields no tokens.

  Raises:
    ValueError: unless 1 <= overlap < window_length. With no overlap the first
      token of a window has no context to be ranked against, and an overlap of
      a full window never advances.
  """
  if not 1 <= overlap < window_length:
    raise ValueError("overlap must satisfy 1 <= overlap < window_length")

  inputs = tokenizer(text, return_tensors="pt", verbose=True)
  input_ids = inputs['input_ids'][0]
  total = len(input_ids)
  if total == 0:
    return []

  indices = [input_ids[0].item()]
  window_start = 0
  # len(indices) is always the position of the next token still to be encoded.
  while len(indices) < total:
    window_end = min(window_start + window_length, total)
    ranks = single_encode(input_ids[window_start:window_end])

    # Only the tail of the window is new; the leading ranks belong to the
    # overlap tokens, which the previous window already encoded.
    new_count = window_end - len(indices)
    indices.extend(ranks[len(ranks) - new_count:])

    window_start = window_end - overlap

  return indices

In [10]:
def window_decode(indices, window_length, overlap):
  """Rebuild text pieces from ranks produced with overlapping windows.

  Tokens are recovered one at a time. The context fed to the model grows until
  it holds `window_length` tokens and is then cut back to its last `overlap`
  tokens before the next prediction.

  Args:
    indices: List of ints. Element 0 is a token id; every later element is a
      rank into the model's predictions ordered by (score, token id)
      descending.
    window_length: Context size at which the context is cut back.
    overlap: Number of most recent tokens kept when the context is cut back.

  Returns:
    A list with one string per entry of `indices`; joining the list gives the
    decoded text. When a token ends in the middle of a multi-byte character,
    its entry is an empty string and the completed text is attached to the
    later token that finishes the character. Returns [] for empty input.

  Raises:
    ValueError: unless 1 <= overlap < window_length.
    IndexError: if a rank is not smaller than the vocabulary size.
  """
  if not 1 <= overlap < window_length:
    raise ValueError("overlap must satisfy 1 <= overlap < window_length")
  if not indices:
    return []

  context = [indices[0]]
  last_position = len(indices) - 1

  output = []
  pending = []  # ids whose bytes do not yet decode to complete characters

  def emit(token_id, position):
    # Decode the buffered ids together so characters split across tokens are
    # reassembled. Cleanup is disabled so spacing is reproduced verbatim.
    pending.append(token_id)
    piece = tokenizer.decode(pending, clean_up_tokenization_spaces=False)
    if piece.endswith("\ufffd") and position != last_position:
      # Trailing replacement character: the bytes are presumably incomplete,
      # so hold the ids back until a later token completes them.
      output.append("")
    else:
      output.append(piece)
      pending.clear()

  emit(indices[0], 0)
  with torch.no_grad():
    for position, rank in enumerate(indices[1:], start=1):
      if len(context) == window_length:
        context = context[-overlap:]

      ids = torch.tensor([context], device=device)
      logits = model(input_ids=ids, attention_mask=torch.ones_like(ids)).logits
      scores = logits[0, -1, :]

      # A stable descending sort of the reversed scores keeps the larger token
      # id first among equal scores, i.e. (score, id) descending order.
      order = torch.sort(scores.flip(0), descending=True, stable=True).indices
      word_index = scores.shape[0] - 1 - order[rank].item()

      emit(word_index, position)
      context.append(word_index)
  return output

In [11]:
from google.protobuf.internal import encoder, decoder

def encode_indices(indices, encoded_filename):
    """Write non-negative integers to a file as base-128 varints.

    Each value is stored least-significant group first, 7 bits per byte, with
    the high bit set on every byte except the last one of a value. This is the
    unsigned varint layout used by Protocol Buffers, implemented here with the
    standard library only instead of protobuf's private helpers.

    Args:
        indices: Iterable of non-negative ints.
        encoded_filename: Path of the file to create or overwrite.

    Raises:
        ValueError: if a value is negative. Nothing is written in that case.
    """
    payload = bytearray()
    for number in indices:
        if number < 0:
            raise ValueError(f"cannot varint-encode negative value {number}")
        while number > 0x7F:
            payload.append((number & 0x7F) | 0x80)
            number >>= 7
        payload.append(number)

    # The file is opened only after every value has been validated and encoded.
    with open(encoded_filename, 'wb') as file:
        file.write(payload)

def decode_indices(filename):
    """Read a file of base-128 varints back into a list of integers.

    Every byte contributes its low 7 bits, least-significant group first; a
    set high bit means the value continues in the next byte.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded ints in file order; an empty list for an empty file.

    Raises:
        ValueError: if the file ends in the middle of a value.
    """
    with open(filename, 'rb') as file:
        encoded_bytes = file.read()

    indices = []
    value = 0
    shift = 0
    for byte in encoded_bytes:
        value |= (byte & 0x7F) << shift
        if byte & 0x80:
            shift += 7
        else:
            indices.append(value)
            value = 0
            shift = 0

    # A non-zero shift here means the last byte still had its continuation bit set.
    if shift:
        raise ValueError("truncated varint at end of file")
    return indices

In [12]:
import os

def compress_file(filename, output_file):
  """Compress a UTF-8 text file into a rank file and print the size statistics.

  Args:
    filename: Path of the UTF-8 text file to compress.
    output_file: Path of the rank file to create or overwrite.
  """
  # newline='' turns off newline translation, so '\r\n' and '\r' reach the
  # encoder exactly as stored instead of being rewritten to '\n'.
  with open(filename, 'r', encoding='utf8', newline='') as file:
    text = file.read()

  # Windowed alternative:
  # encode_indices(window_encode(text, 32, 4), output_file)
  encode_indices(encode(text), output_file)

  original_size = os.path.getsize(filename)
  compressed_size = os.path.getsize(output_file)

  print("Original size:", original_size, "bytes")
  print("Compressed size:", compressed_size, "bytes")
  if original_size:
    print("Compression ratio:", compressed_size/original_size)
  else:
    # An empty input has no meaningful ratio; avoid dividing by zero.
    print("Compression ratio: n/a (empty input)")



In [13]:
def decompress_file(compressed_file, output_file):
  """Rebuild a UTF-8 text file from a rank file.

  Args:
    compressed_file: Path of the rank file to read.
    output_file: Path of the text file to create or overwrite.
  """
  indices = decode_indices(compressed_file)
  # Windowed alternative:
  # text = ''.join(window_decode(indices, 32, 4))
  text = ''.join(decode(indices))
  # newline='' writes the decoded text verbatim, with no platform-specific
  # rewriting of '\n'.
  with open(output_file, 'w', encoding='utf8', newline='') as file:
    file.write(text)

In [14]:
# Compress the sample text; compress_file prints the original size, the
# compressed size and their ratio.
compress_file('tolstoi.txt', 'tolstoi.bin')

Original size: 2591 bytes
Compressed size: 711 bytes
Compression ratio: 0.27441142416055575


In [15]:
# Decode the rank file produced above and write the recovered text to a new file.
decompress_file('tolstoi.bin', 'original.txt')

In [2]:
import os
import gzip
def compress_gzip(filename):
  """Gzip a file next to the original and report how much it shrank.

  The compressed copy is written to `filename + '.gz'`. The original size and
  then the compressed size, both in bytes, are printed on separate lines.

  Args:
    filename: Path of the file to compress.

  Returns:
    A tuple (path of the .gz file, compressed size divided by original size).
  """
  gz_path = filename + '.gz'

  with open(filename, 'rb') as source:
    raw = source.read()
  with gzip.open(gz_path, 'wb') as sink:
    sink.write(raw)

  # Measure and print the original first, then the compressed copy.
  sizes = []
  for path in (filename, gz_path):
    sizes.append(os.path.getsize(path))
    print(sizes[-1])

  size_before, size_after = sizes
  return (gz_path, size_after / size_before)


In [3]:
# gzip baseline on the same input file, for comparison with the ratio printed
# by compress_file.
file, ratio = compress_gzip('tolstoi.txt')
print(ratio)

2591
1376
0.5310690852952528
