# Installs


In [10]:
!pip install tqdm
!pip install -q datasets
# !pip install -q https://github.com/kpu/kenlm/archive/master.zip , no need as we do not use the perplexity calculation from the python module but it is always better to take a look :)



install kenlm OS requirements

In [11]:
!brew install cmake zlib bzip2 xz eigen boost

Running `brew update --auto-update`...
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Casks[0m
avira-antivirus     keyfinder           scansion            snappy

You have [1m14[0m outdated formulae installed.
You can upgrade them with [1mbrew upgrade[0m
or list them with [1mbrew outdated[0m.

cmake 3.25.3 is already installed but outdated (so it will be upgraded).
To reinstall 1.2.13, run:
  brew reinstall zlib
To reinstall 1.0.8, run:
  brew reinstall bzip2
To reinstall 5.4.1, run:
  brew reinstall xz
To reinstall 3.4.0_1, run:
  brew reinstall eigen
To reinstall 1.81.0_1, run:
  brew reinstall boost
[32m==>[0m [1mFetching [32mcmake[39m[0m
[34m==>[0m [1mDownloading https://ghcr.io/v2/homebrew/core/cmake/manifests/3.26.0[0m
######################################################################## 100.0%
[34m==>[0m [1mDownloading https://ghcr.io/v2/homebrew/core/cmake/blobs/sha256:6fb143b21a37[0m
[

build kenlm from source

In [1]:
# %%shell
# if [ ! -d kenlm ]; then
#   wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
#   mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
# else
#   echo kenlm already exist
# fi

UsageError: Cell magic `%%shell` not found.


# imports

In [2]:
import os
import psutil
import datasets
from tqdm.auto import tqdm

# Load Dataset

In [3]:
dataset = datasets.load_dataset("JulesBelveze/tldr_news")
dataset

No config specified, defaulting to: tldr_news/all
Found cached dataset tldr_news (/Users/waleedalasad/.cache/huggingface/datasets/JulesBelveze___tldr_news/all/1.2.0/3c113655b652c0d79c5963be201cf044c00f0bc9c04aefeb9ad11e530d46ef25)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'content', 'category'],
        num_rows: 7138
    })
    test: Dataset({
        features: ['headline', 'content', 'category'],
        num_rows: 794
    })
})

# Utility Functions

In [4]:
def estimate_memory_to_use_by_lm_modeler(verbose=True, margin=5):
  '''
  this funciton will estimate the memory to use by LM
  it calculates the current usage of memory adding some margin
  then return the remaining percentage of memory to be used
  '''
  # get the current memory used
  current_memory_usage = psutil.virtual_memory().percent
  # add some margin
  current_memory_usage += margin
  memory_to_use = int(100 - current_memory_usage)
  if verbose:
    print()
    print('#'*80)
    print('Estimating the LM model using:',f'{memory_to_use}% of memory.')
    print('#'*80)
    print()
  return memory_to_use

In [5]:
def train_lm_model(
    ngram:int,
    train_dataset:list,
    dataset_name:str,
    models_path='LMs',
    use_tqdm=True,
):
    !mkdir -p "{models_path}"/"{dataset_name}"/"{ngram}"
    
    print(" Writing train_dataset into file ".center(80, "#"))

    train_dataset = tqdm(train_dataset) if use_tqdm else train_dataset

    with open(
        f"{models_path}/{dataset_name}/train_dataset.txt",
        "w",
    ) as train_dataset_file:
        for item in train_dataset:
            train_dataset_file.write(item)
            train_dataset_file.write("\n")

    memory_to_use = estimate_memory_to_use_by_lm_modeler()

    !kenlm/build/bin/lmplz -o "{ngram}" -S "{memory_to_use}"% --discount_fallback < "{models_path}"/"{dataset_name}"/train_dataset.txt > "{models_path}"/"{dataset_name}"/"{ngram}"/lm.arpa

    print("#" * 80)
    print(" Converting .arpa file to binary file. ".center(80, "#"))

    !kenlm/build/bin/build_binary "{models_path}"/"{dataset_name}"/"{ngram}"/lm.arpa "{models_path}"/"{dataset_name}"/"{ngram}"/lm.binary

In [6]:
def get_perplexity_and_OOVs(
    ngram:int,
    test_dataset:list,
    dataset_name:str,
    models_path='LMs',
    overwrite_files=False,
    print_to_console=True,
):
    
    with open(
        f"{models_path}/{dataset_name}/test_dataset.txt",
        "w",
    ) as test_dataset_file:
        test_dataset_file.write("\n".join(test_dataset))

    # calculate and dump to a file
    !kenlm/build/bin/query "{models_path}"/"{dataset_name}"/"{ngram}"/lm.binary < "{models_path}"/"{dataset_name}"/test_dataset.txt > "{models_path}"/"{dataset_name}"/results.txt

    with open(f"{models_path}/{dataset_name}/results.txt") as f:
        lines = f.read().splitlines()

    # collect
    perplexity_with_OOVs_line = lines[-4]
    perplexity_without_OOVs_line = lines[-3]
    counts_of_OOVs_line = lines[-2]

    perplexity_with_OOVs = float(perplexity_with_OOVs_line.split("Perplexity including OOVs:")[-1].strip())
    perplexity_without_OOVs = float(perplexity_without_OOVs_line.split("Perplexity excluding OOVs:")[-1].strip())
    counts_of_OOVs = int(counts_of_OOVs_line.split("OOVs:")[-1].strip())

    return perplexity_with_OOVs, perplexity_without_OOVs, counts_of_OOVs

# Train and collect the results

In [7]:
def process_dataset(dataset:list):
  processed_dataset = []
  for document in dataset:
    if len(document)==0:
      continue
    processed_dataset.append(document)
  return processed_dataset

In [8]:
train_dataset = process_dataset(dataset['train']['content'])
len(train_dataset)

7138

In [17]:
train_lm_model(
    ngram=3,
    dataset_name='news',
    train_dataset=train_dataset,
)

####################### Writing train_dataset into file ########################


  0%|          | 0/7138 [00:00<?, ?it/s]


################################################################################
Estimating the LM model using: 28% of memory.
################################################################################

=== 1/5 Counting and sorting n-grams ===
Reading stdin
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 564020 types 47609
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:571308 2:1672971136 3:3136820992
Statistics:
1 47609 D1=0.641568 D2=1.11291 D3+=1.4063
2 283707 D1=0.812303 D2=1.15936 D3+=1.43938
3 468629 D1=0.909354 D2=1.28055 D3+=1.33798
Memory estimate for binary LM:
type       kB
probing 16095 assuming -p 1.5
probing 17944 assuming -r models -p 1.5
trie     7198 without quantization
trie     4258 assuming -q 8 -b 8 quantization 
trie     6805 assuming -a 22 array pointer compression
trie 

In [13]:
test_dataset = process_dataset(dataset=dataset['test']['content'])
len(test_dataset)

794

In [22]:
def build_model(n_gram, train_data_path):
    !mkdir -p "LMs"/"{n_gram}"
    !kenlm/build/bin/lmplz --text "{train_data_path}" --arpa "LMs"/"{n_gram}"/UNCorpus.train.tok.3.arpa --order {n_gram} --discount_fallback  --verbose_header 
    
build_model(1, 'English-Mix/UNCorpus.train.tok')
build_model(2, 'English-Mix/UNCorpus.train.tok')
build_model(3, 'English-Mix/UNCorpus.train.tok')

=== 1/5 Counting and sorting n-grams ===
Reading fd 3
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 2933681 types 17863
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:214356
Statistics:
1 17863 D1=0.532532 D2=1.06821 D3+=1.37723
Memory estimate for binary LM:
type     kB
probing 767 assuming -p 1.5
probing 837 assuming -r models -p 1.5
trie    519 without quantization
trie    467 assuming -q 8 -b 8 quantization 
trie    519 assuming -a 22 array pointer compression
trie    467 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:214356
=== 4/5 Calculating and writing order-interpolated probabilities ===
Chain sizes: 1:214356
=== 5/5 Writing ARPA model ===
----5---10---15---20---25---30---35---40---45---50---55---

In [14]:
perplexity_with_OOVs, perplexity_without_OOVs, counts_of_OOVs = get_perplexity_and_OOVs(ngram=4,test_dataset=test_dataset,dataset_name='news')
print('-'*80)
f'{perplexity_with_OOVs=}, {perplexity_without_OOVs=}, {counts_of_OOVs=}'

This binary file contains probing hash tables.
query	31326208	30932992	RSSMax:31326208 kB	user:0.021048	sys:0.009426	CPU:0.030491	real:0.027348
--------------------------------------------------------------------------------


'perplexity_with_OOVs=576.4342004619074, perplexity_without_OOVs=391.5395051631819, counts_of_OOVs=3312'