# N-Gram Models

N-gram is a contiguous sequence of n items from a text or speech sample which can be letters, syllables, or words. It is a simple form of langauge modelling that can capture context and statistical patterns.

In [2]:
!uv pip install pandas
!uv pip install datasets

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 236ms[0m[0m
[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 117ms[0m[0m


In [3]:
import random
from collections import Counter, defaultdict
import textwrap
import pandas as pd

## Load in Naijaweb Dataset [1]

In [4]:
from datasets import load_dataset

naija_web = load_dataset("saheedniyi/naijaweb")
dataset = naija_web["train"]["text"][:500]
print(f"The dataset consists of {len(dataset)} paragraphs.")

The dataset consists of 500 paragraphs.


In [5]:
for paragraph in dataset[:10]:
    formatted_paragraph = textwrap.fill(paragraph)
    print(f"{formatted_paragraph}\n")

Governor Samuel Ortom of Benue State By Peter Duru Governor Samuel
Ortom of Benue state has commended President Muhammadu Buhari for his
directive to security agents to shoot anyone illegally bearing AK47
rifle in the country. The Governor who gave the commendation Thursday
in Makurdi said the President’s order would reduce the level of
criminality, banditry and militia herders’ attacks on Benue
communities as well as in other parts of the country. According to
him, “the order would also make the communities safer for displaced
farmers to return to their ancestral homes. “I wish to commend Mr.
President for his recent order against those bearing AK47 rifles. This
I am sure will reduce the high rate of criminality, banditary and
militia herdsmen attacks on our farming communities,” the Governor
said. He noted that President Buhari had done the right thing by
listening to the calls he and other concerned Nigerians made on the
need for the Federal Government to act faster and decisively t

## Building blocks for the n-gram model

In [6]:
from typing import List, Dict

def space_tokenizer(text: str) -> List[str]:
  """Splits a string into a list of words (tokens)"""
  return text.split()


In [7]:
space_tokenizer(dataset[0])

['Governor',
 'Samuel',
 'Ortom',
 'of',
 'Benue',
 'State',
 'By',
 'Peter',
 'Duru',
 'Governor',
 'Samuel',
 'Ortom',
 'of',
 'Benue',
 'state',
 'has',
 'commended',
 'President',
 'Muhammadu',
 'Buhari',
 'for',
 'his',
 'directive',
 'to',
 'security',
 'agents',
 'to',
 'shoot',
 'anyone',
 'illegally',
 'bearing',
 'AK47',
 'rifle',
 'in',
 'the',
 'country.',
 'The',
 'Governor',
 'who',
 'gave',
 'the',
 'commendation',
 'Thursday',
 'in',
 'Makurdi',
 'said',
 'the',
 'President’s',
 'order',
 'would',
 'reduce',
 'the',
 'level',
 'of',
 'criminality,',
 'banditry',
 'and',
 'militia',
 'herders’',
 'attacks',
 'on',
 'Benue',
 'communities',
 'as',
 'well',
 'as',
 'in',
 'other',
 'parts',
 'of',
 'the',
 'country.',
 'According',
 'to',
 'him,',
 '“the',
 'order',
 'would',
 'also',
 'make',
 'the',
 'communities',
 'safer',
 'for',
 'displaced',
 'farmers',
 'to',
 'return',
 'to',
 'their',
 'ancestral',
 'homes.',
 '“I',
 'wish',
 'to',
 'commend',
 'Mr.',
 'President

In [8]:
def generate_ngrams(text: str, n: int) -> List[tuple]:
  """Generates n-grams from a paragraph"""

  tokens = space_tokenizer(text)

  num_of_tokens = len(tokens)

  ngrams = [tuple(tokens[i:i+n]) for  i in range(0, num_of_tokens - n + 1)]


  return ngrams

In [9]:
generate_ngrams(dataset[0], 2)

[('Governor', 'Samuel'),
 ('Samuel', 'Ortom'),
 ('Ortom', 'of'),
 ('of', 'Benue'),
 ('Benue', 'State'),
 ('State', 'By'),
 ('By', 'Peter'),
 ('Peter', 'Duru'),
 ('Duru', 'Governor'),
 ('Governor', 'Samuel'),
 ('Samuel', 'Ortom'),
 ('Ortom', 'of'),
 ('of', 'Benue'),
 ('Benue', 'state'),
 ('state', 'has'),
 ('has', 'commended'),
 ('commended', 'President'),
 ('President', 'Muhammadu'),
 ('Muhammadu', 'Buhari'),
 ('Buhari', 'for'),
 ('for', 'his'),
 ('his', 'directive'),
 ('directive', 'to'),
 ('to', 'security'),
 ('security', 'agents'),
 ('agents', 'to'),
 ('to', 'shoot'),
 ('shoot', 'anyone'),
 ('anyone', 'illegally'),
 ('illegally', 'bearing'),
 ('bearing', 'AK47'),
 ('AK47', 'rifle'),
 ('rifle', 'in'),
 ('in', 'the'),
 ('the', 'country.'),
 ('country.', 'The'),
 ('The', 'Governor'),
 ('Governor', 'who'),
 ('who', 'gave'),
 ('gave', 'the'),
 ('the', 'commendation'),
 ('commendation', 'Thursday'),
 ('Thursday', 'in'),
 ('in', 'Makurdi'),
 ('Makurdi', 'said'),
 ('said', 'the'),
 ('the', 

In [10]:
generate_ngrams(dataset[1], 3)

[('NewsHelm.com', 'offers', 'a'),
 ('offers', 'a', 'unique'),
 ('a', 'unique', 'blend'),
 ('unique', 'blend', 'of'),
 ('blend', 'of', 'modernity'),
 ('of', 'modernity', 'and'),
 ('modernity', 'and', 'trustworthiness.'),
 ('and', 'trustworthiness.', 'Its'),
 ('trustworthiness.', 'Its', 'short'),
 ('Its', 'short', 'and'),
 ('short', 'and', 'catchy'),
 ('and', 'catchy', 'name'),
 ('catchy', 'name', 'instantly'),
 ('name', 'instantly', 'conveys'),
 ('instantly', 'conveys', 'the'),
 ('conveys', 'the', 'idea'),
 ('the', 'idea', 'of'),
 ('idea', 'of', 'a'),
 ('of', 'a', 'news'),
 ('a', 'news', 'or'),
 ('news', 'or', 'media'),
 ('or', 'media', 'platform.'),
 ('media', 'platform.', 'This'),
 ('platform.', 'This', 'domain'),
 ('This', 'domain', 'name'),
 ('domain', 'name', 'is'),
 ('name', 'is', 'ideal'),
 ('is', 'ideal', 'for'),
 ('ideal', 'for', 'businesses'),
 ('for', 'businesses', 'in'),
 ('businesses', 'in', 'the'),
 ('in', 'the', 'news'),
 ('the', 'news', 'industry,'),
 ('news', 'industry,

In [11]:
import time

all_unigrams = []
all_bigrams = []
all_trigrams = []


for paragraph in dataset:
  all_unigrams.extend(generate_ngrams(paragraph, 1))
  all_bigrams.extend(generate_ngrams(paragraph, 2))
  all_trigrams.extend(generate_ngrams(paragraph, 3))

  if len(all_unigrams) > 10000:
    time.sleep(1)

In [12]:
bigram_counts = Counter(all_bigrams)

print("Most common bigrams:")
for bigram, count in bigram_counts.most_common(10):
    print(f"  ({bigram}, {count})")

trigram_counts = Counter(all_trigrams)

print("\n\nMost common trigrams:")
for trigram, count in trigram_counts.most_common(10):
    print(f"  ({trigram}, {count})")

Most common bigrams:
  (('of', 'the'), 3510)
  (('in', 'the'), 1763)
  (('to', 'the'), 1148)
  (('that', 'the'), 917)
  (('for', 'the'), 725)
  (('on', 'the'), 706)
  (('by', 'the'), 654)
  (('at', 'the'), 542)
  (('with', 'the'), 534)
  (('and', 'the'), 529)


Most common trigrams:
  (('Comments', 'expressed', 'here'), 292)
  (('expressed', 'here', 'do'), 292)
  (('here', 'do', 'not'), 292)
  (('do', 'not', 'reflect'), 292)
  (('not', 'reflect', 'the'), 292)
  (('reflect', 'the', 'opinions'), 292)
  (('the', 'opinions', 'of'), 292)
  (('opinions', 'of', 'Vanguard'), 292)
  (('of', 'Vanguard', 'newspapers'), 292)
  (('Vanguard', 'newspapers', 'or'), 292)


In [13]:
def generate_ngram_counts(dataset: List[str], n: int) -> Dict[str, Counter]:
  """Generates n-gram counts for the Naijaweb dataset"""

  ngram_counts = defaultdict(Counter)

  for paragraph in dataset:

      ngrams_list = generate_ngrams(paragraph, n)

      for ngram in ngrams_list:
          context = " ".join(ngram[:-1])
          ngram_counts[context][ngram[-1]] += 1

  return dict(ngram_counts)

In [14]:
sample_dataset = dataset[:2]

generate_ngram_counts(sample_dataset, 3)

{'Governor Samuel': Counter({'Ortom': 2}),
 'Samuel Ortom': Counter({'of': 2}),
 'Ortom of': Counter({'Benue': 2}),
 'of Benue': Counter({'State': 1, 'state': 1}),
 'Benue State': Counter({'By': 1}),
 'State By': Counter({'Peter': 1}),
 'By Peter': Counter({'Duru': 1}),
 'Peter Duru': Counter({'Governor': 1}),
 'Duru Governor': Counter({'Samuel': 1}),
 'Benue state': Counter({'has': 1}),
 'state has': Counter({'commended': 1}),
 'has commended': Counter({'President': 1}),
 'commended President': Counter({'Muhammadu': 1}),
 'President Muhammadu': Counter({'Buhari': 1}),
 'Muhammadu Buhari': Counter({'for': 1}),
 'Buhari for': Counter({'his': 1}),
 'for his': Counter({'directive': 1, 'recent': 1}),
 'his directive': Counter({'to': 1}),
 'directive to': Counter({'security': 1}),
 'to security': Counter({'agents': 1}),
 'security agents': Counter({'to': 1}),
 'agents to': Counter({'shoot': 1}),
 'to shoot': Counter({'anyone': 1}),
 'shoot anyone': Counter({'illegally': 1}),
 'anyone illega

In [None]:
def build_ngram_model(
    dataset: List[str],
    n: int
) -> Dict[str, Dict[str, float]]:
    """Build an n-gram language model."""

    ngram_model = defaultdict(dict)

    ngram_counts = generate_ngram_counts(dataset, n)

    for context, next_tokens in ngram_counts.items():
        total = sum(next_tokens.values())
        for next_token, counts in next_tokens.items():
          ngram_model[context][next_token] = counts/total

    return ngram_model

# Testing the function
test_dataset = ["The governor is tall.", "The governor is greedy."]
test_trigram_model = build_ngram_model(test_dataset, n=3)
test_trigram_model

{'The governor': {'is': 1.0}, 'governor is': {'tall.': 0.5, 'greedy.': 0.5}}

In [21]:
def generate_output(
    prompt: str,
    n: int,
    num_of_tokens: int,
    ngram_model: Dict[str, Dict[str, float]]
) -> str:
    """Generates new token output for ngram model"""

    generated_tokens = space_tokenizer(prompt)

    for _ in range(num_of_tokens):
        context = generated_tokens[-(n - 1):]
        context = " ".join(context)
        if context in ngram_model:
            next_word = random.choices(
                list(ngram_model[context].keys()),
                weights=ngram_model[context].values()
            )[0]

            generated_tokens.append(next_word)
        else:
            print("No valid continuation found")
            break

    return " ".join(generated_tokens)


## Inference with new n-gram model

In [25]:
prompt = "he lamented"

trigram_model = build_ngram_model(dataset, n=3)

output = generate_output(prompt, n=3, num_of_tokens=20, ngram_model=trigram_model)

print(output)

he lamented as the refusal of Imo State. With the sum of $75 million because the parents and relatives, colleagues and generals


## References

Dataset Used

[1] Saheed Azeez. (2024). *Naijaweb: A Web Scraped Nigerian Context Dataset* (Version 1.0.0). Hugging Face Datasets. Available at: [https://huggingface.co/datasets/saheedniyi/naijaweb](https://huggingface.co/datasets/saheedniyi/naijaweb)