<a href="https://colab.research.google.com/github/Tekleab15/N-gram-Language-Models/blob/main/N_gram_language_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import tensorflow as tf
import re, csv
import pandas as pd

# Mount Google Drive at /content/drive so the corpus stored there is readable.
# `drive` comes from google.colab, so this cell only runs inside Colab.
drive.mount('/content/drive')
# Location of the raw corpus text file on the mounted Drive; the cells below
# open it as UTF-8 text.
file_path = '/content/drive/MyDrive/Amharic_Corpus/Copy_of_GPAC.txt'

# Text cleaning
def clean_text(text):
    """Remove every character that is not part of the kept alphabet.

    Kept characters are: Ethiopic syllables U+1200-U+135A, ASCII digits,
    whitespace, and the ASCII punctuation marks ``- . , ! ?``.

    The syllable range runs through U+135A rather than stopping at U+1350
    (the first letter of the "pe" series). Stopping at U+1350 deletes the
    remaining "pe" forms (U+1351-U+1357) and U+1358-U+135A, which silently
    truncates words that contain them before any n-gram is built.

    Ethiopic punctuation (for example U+1362, the Ethiopic full stop) lies
    above the kept range and is still removed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all characters outside the kept set deleted.
    """
    return re.sub(r'[^0-9\s\-\.,!?\u1200-\u135A]', '', text)
def create_ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens as a space-joined string.

    Args:
        tokens: Sequence of string tokens (for example the result of
            ``str.split``).
        n: Size of the sliding window.

    Returns:
        A list of n-gram strings in order of appearance. The list is empty
        when ``n`` is not positive or when there are fewer than ``n`` tokens.
    """
    if n <= 0:
        return []
    # Number of complete windows; range() is empty when this is <= 0.
    window_count = len(tokens) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# In-memory n-gram lists for the whole corpus (one entry per occurrence).
unigrams = []
bigrams = []
trigrams = []
fourgrams = []

# Report progress each time the running unigram total passes another multiple
# of this value. Testing `len(unigrams) % N == 0` is unreliable: the total grows
# by a whole line at a time, so it rarely lands exactly on a multiple, and it is
# trivially true at 0 while only blank lines have been read.
PROGRESS_EVERY = 100_000
next_progress_mark = PROGRESS_EVERY

# Read and process the file line by line
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        cleaned_line = clean_text(line)
        tokens = cleaned_line.split()
        # n-grams are built per line, so none of them spans a line break.
        unigrams.extend(create_ngrams(tokens, 1))
        bigrams.extend(create_ngrams(tokens, 2))
        trigrams.extend(create_ngrams(tokens, 3))
        fourgrams.extend(create_ngrams(tokens, 4))
        # Print progress (optional)
        if len(unigrams) >= next_progress_mark:
            print(f"Processed {len(unigrams)} unigrams...")
            # Advance to the first multiple strictly above the current total.
            next_progress_mark = (len(unigrams) // PROGRESS_EVERY + 1) * PROGRESS_EVERY

# Show a small sample of each order.
print("Unigram: ",unigrams[:10])
print("Bigram: ",bigrams[:10])
print("Trigram: ",trigrams[:10])
print("Fourgram: ",fourgrams[:10])

Unigram:  ['ምን', 'መሰላችሁ?', 'አንባቢያን', 'ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው', 'ደርሷት', 'ልትታደመው', 'ያልቻለችው', 'የአለም']
Bigram:  ['ምን መሰላችሁ?', 'መሰላችሁ? አንባቢያን', 'አንባቢያን ኢትዮጵያ', 'ኢትዮጵያ በተደጋጋሚ', 'በተደጋጋሚ ጥሪው', 'ጥሪው ደርሷት', 'ደርሷት ልትታደመው', 'ልትታደመው ያልቻለችው', 'ያልቻለችው የአለም', 'የአለም የእግር']
Trigram:  ['ምን መሰላችሁ? አንባቢያን', 'መሰላችሁ? አንባቢያን ኢትዮጵያ', 'አንባቢያን ኢትዮጵያ በተደጋጋሚ', 'ኢትዮጵያ በተደጋጋሚ ጥሪው', 'በተደጋጋሚ ጥሪው ደርሷት', 'ጥሪው ደርሷት ልትታደመው', 'ደርሷት ልትታደመው ያልቻለችው', 'ልትታደመው ያልቻለችው የአለም', 'ያልቻለችው የአለም የእግር', 'የአለም የእግር ኳስ']
Fourgram:  ['ምን መሰላችሁ? አንባቢያን ኢትዮጵያ', 'መሰላችሁ? አንባቢያን ኢትዮጵያ በተደጋጋሚ', 'አንባቢያን ኢትዮጵያ በተደጋጋሚ ጥሪው', 'ኢትዮጵያ በተደጋጋሚ ጥሪው ደርሷት', 'በተደጋጋሚ ጥሪው ደርሷት ልትታደመው', 'ጥሪው ደርሷት ልትታደመው ያልቻለችው', 'ደርሷት ልትታደመው ያልቻለችው የአለም', 'ልትታደመው ያልቻለችው የአለም የእግር', 'ያልቻለችው የአለም የእግር ኳስ', 'የአለም የእግር ኳስ ዋ']


In [None]:
def write_ngram_counts_to_disk(file_path, n, output_file):
    """Stream a corpus and append per-chunk n-gram counts to a CSV file.

    The corpus is read line by line. Each line is cleaned with ``clean_text``,
    split on whitespace and turned into n-grams with ``create_ngrams``. Counts
    are kept only for the current chunk of 1000 lines; at each chunk boundary
    the ``(ngram, count)`` pairs are written as CSV rows and the counts are
    reset.

    Because counts are reset per chunk, an n-gram that occurs in several
    chunks appears on several rows of the output file, each row holding that
    chunk's count.

    Args:
        file_path: Path of the UTF-8 text corpus to read.
        n: Order of the n-grams to count.
        output_file: Path of the CSV file to create (truncated if it exists).
    """
    chunk_size = 1000

    with open(file_path, 'r', encoding='utf-8') as corpus, \
            open(output_file, 'w', newline='', encoding='utf-8') as sink:
        csv_out = csv.writer(sink)
        chunk_counts = {}

        for line_no, raw_line in enumerate(corpus, start=1):
            for gram in create_ngrams(clean_text(raw_line).split(), n):
                chunk_counts[gram] = chunk_counts.get(gram, 0) + 1

            if line_no % chunk_size == 0:
                # Chunk boundary: flush this chunk's counts and start afresh.
                csv_out.writerows(chunk_counts.items())
                chunk_counts = {}
                print(f"Processed {line_no} lines...")

        # Flush whatever is left from the final, partial chunk (a no-op when
        # nothing was counted since the last boundary).
        csv_out.writerows(chunk_counts.items())

# Write n-gram counts to disk: one CSV per order (n = 1..4), created in the
# current working directory. Each call re-reads the corpus at file_path from
# the start and opens its output file in 'w' mode, so an existing file of the
# same name is replaced.
write_ngram_counts_to_disk(file_path, 1, 'unigrams.csv')
write_ngram_counts_to_disk(file_path, 2, 'bigrams.csv')
write_ngram_counts_to_disk(file_path, 3, 'trigrams.csv')
write_ngram_counts_to_disk(file_path, 4, 'fourgrams.csv')


In [None]:
def calculate_total_counts(file_path):
    """Add up the count column of an n-gram CSV file.

    Args:
        file_path: Path of a UTF-8 CSV file whose rows are ``ngram,count``.

    Returns:
        The integer sum of the second column over all rows (0 for an empty
        file).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return sum(int(record[1]) for record in csv.reader(handle))

# Total number of n-gram occurrences per order, obtained by summing the count
# column of each CSV written earlier.
total_unigrams = calculate_total_counts('unigrams.csv')
total_bigrams = calculate_total_counts('bigrams.csv')
total_trigrams = calculate_total_counts('trigrams.csv')
total_fourgrams = calculate_total_counts('fourgrams.csv')

# Show the totals for a quick sanity check.
print(f"Total unigrams: {total_unigrams}")
print(f"Total bigrams: {total_bigrams}")
print(f"Total trigrams: {total_trigrams}")
print(f"Total fourgrams: {total_fourgrams}")

def calculate_probabilities_from_disk(file_path, total_count):
    """Load an n-gram CSV and return each n-gram's relative frequency.

    The CSV may list the same n-gram on more than one row (the chunked writer
    in this notebook emits one row per n-gram per chunk). Counts are therefore
    summed per n-gram before dividing; assigning ``count / total_count`` row
    by row would keep only the last row seen for a repeated n-gram and
    under-report its probability.

    Args:
        file_path: Path of a UTF-8 CSV file whose rows are ``ngram,count``.
        total_count: Denominator for the relative frequency, normally the sum
            of the count column of the same file.

    Returns:
        A dict mapping each distinct n-gram to ``summed_count / total_count``.

    Raises:
        ZeroDivisionError: If the file contains rows and ``total_count`` is 0.
        ValueError: If a non-empty row does not have exactly two fields or its
            count is not an integer.
    """
    ngram_counts = {}
    # newline='' lets the csv module handle line endings itself.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; nothing to count.
                continue
            ngram, count = row
            # Accumulate integers and divide once to avoid float drift.
            ngram_counts[ngram] = ngram_counts.get(ngram, 0) + int(count)
    return {ngram: count / total_count for ngram, count in ngram_counts.items()}

# Build one probability table per order. Each value is the n-gram's count
# divided by the total count for that same order, i.e. a relative frequency
# among all n-grams of that length rather than a conditional probability.
# Every table is held fully in memory as a dict keyed by the n-gram string.
unigram_prob = calculate_probabilities_from_disk('unigrams.csv', total_unigrams)
bigram_prob = calculate_probabilities_from_disk('bigrams.csv', total_bigrams)
trigram_prob = calculate_probabilities_from_disk('trigrams.csv', total_trigrams)
fourgram_prob = calculate_probabilities_from_disk('fourgrams.csv', total_fourgrams)



Task 1.1: Create n-grams for n=1, 2, 3, 4. You can show sample prints.

- Count the frequency of n-grams

In [None]:
# Memory-efficient technique
import csv
def write_ngram_counts_to_disk(file_path, n, output_file):
    """Write per-chunk n-gram counts to a CSV file and return the grand total.

    The corpus is streamed line by line; each line is cleaned with
    ``clean_text``, split on whitespace and expanded with ``create_ngrams``.
    Counts are held only for the current chunk of 1000 lines. At each chunk
    boundary the ``(ngram, count)`` pairs are written as CSV rows and the
    counts are discarded.

    Because counts are discarded per chunk, an n-gram that occurs in several
    chunks appears on several rows of the output file, each row carrying that
    chunk's count.

    Args:
        file_path: Path of the UTF-8 text corpus to read.
        n: Order of the n-grams to count.
        output_file: Path of the CSV file to create (truncated if it exists).

    Returns:
        The total number of n-gram occurrences seen across the whole corpus.
    """
    chunk_size = 1000
    running_total = 0

    with open(file_path, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', newline='', encoding='utf-8') as dst:
        out = csv.writer(dst)
        pending = {}
        lines_seen = 0

        for raw in src:
            lines_seen += 1
            grams = create_ngrams(clean_text(raw).split(), n)
            running_total += len(grams)
            for gram in grams:
                pending[gram] = pending.get(gram, 0) + 1

            if lines_seen % chunk_size != 0:
                continue

            # Chunk boundary reached: emit this chunk's counts and reset.
            out.writerows(pending.items())
            pending = {}
            print(f"Processed {lines_seen} lines...")

        # Emit the counts of the trailing partial chunk, if it produced any.
        if pending:
            out.writerows(pending.items())

    return running_total

# Corpus location under /content/drive (a Google Drive path in Colab).
file_path = '/content/drive/MyDrive/Amharic_Corpus/Copy_of_GPAC.txt'

# Write one CSV per order (n = 1..4) into the working directory and keep the
# value returned by each call. Every call re-reads the corpus from the start
# and replaces any existing file of the same name.
total_unigrams = write_ngram_counts_to_disk(file_path, 1, 'unigrams.csv')
total_bigrams = write_ngram_counts_to_disk(file_path, 2, 'bigrams.csv')
total_trigrams = write_ngram_counts_to_disk(file_path, 3, 'trigrams.csv')
total_fourgrams = write_ngram_counts_to_disk(file_path, 4, 'fourgrams.csv')

# Show the totals for a quick sanity check.
print(f"Total unigrams: {total_unigrams}")
print(f"Total bigrams: {total_bigrams}")
print(f"Total trigrams: {total_trigrams}")
print(f"Total fourgrams: {total_fourgrams}")
