<a href="https://colab.research.google.com/github/Pravallikabesi/NLP_LAB/blob/main/ngram_models_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import Counter

def unigram_model(corpus):
    """Estimate unigram probabilities from a list of sentences.

    The sentences are joined with single spaces, lower-cased and split on
    whitespace; each token's probability is its share of all tokens.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        dict mapping every distinct token to ``count / total_tokens``, in
        order of first appearance. An empty corpus yields an empty dict.
    """
    text = " ".join(corpus)
    frequencies = Counter(text.lower().split())
    n_tokens = sum(frequencies.values())

    probabilities = {}
    for token, occurrences in frequencies.items():
        probabilities[token] = occurrences / n_tokens
    return probabilities

# Example: three-sentence toy corpus; prints each lower-cased word's
# relative frequency across the whole corpus.
corpus = [
    "I love natural language processing",
    "Language models are fun to build",
    "I love building language models"
]

print(unigram_model(corpus))


{'i': 0.125, 'love': 0.125, 'natural': 0.0625, 'language': 0.1875, 'processing': 0.0625, 'models': 0.125, 'are': 0.0625, 'fun': 0.0625, 'to': 0.0625, 'build': 0.0625, 'building': 0.0625}


In [None]:
from collections import Counter

def simple_bigram_model(corpus):
    """Estimate bigram probabilities P(w2 | w1) from a list of sentences.

    Each sentence is lower-cased and split on whitespace. Bigrams are
    collected inside each sentence only, so the last word of one sentence is
    never paired with the first word of the next one.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        Nested dict ``{w1: {w2: probability}}`` with
        ``probability = count(w1, w2) / count(w1)``. ``count(w1)`` covers
        every occurrence of ``w1``, including sentence-final ones, so a row
        sums to less than 1 when ``w1`` sometimes ends a sentence. Words
        that are never followed by another word have no entry.
    """
    bigram_counts = Counter()
    word_counts = Counter()
    for sentence in corpus:
        words = sentence.lower().split()
        word_counts.update(words)
        # Pair each word with its successor within this sentence only.
        bigram_counts.update(zip(words, words[1:]))

    model = {}
    for (w1, w2), count in bigram_counts.items():
        model.setdefault(w1, {})[w2] = count / word_counts[w1]
    return model

# Example: three-sentence toy corpus; prints, for each first word, the
# estimated probability of every word observed directly after it.
corpus = [
    "I love natural language processing",
    "Language models are fun to build",
    "I love building language models"
]

print(simple_bigram_model(corpus))

{'i': {'love': 1.0}, 'love': {'natural': 0.5, 'building': 0.5}, 'natural': {'language': 1.0}, 'language': {'processing': 0.3333333333333333, 'models': 0.6666666666666666}, 'processing': {'language': 1.0}, 'models': {'are': 0.5}, 'are': {'fun': 1.0}, 'fun': {'to': 1.0}, 'to': {'build': 1.0}, 'build': {'i': 1.0}, 'building': {'language': 1.0}}


In [None]:
from collections import defaultdict

def trigram_model(corpus):
    """Estimate trigram probabilities P(w3 | w1, w2) from a list of sentences.

    Each sentence is lower-cased and split on whitespace. Trigrams are
    collected inside each sentence only, so no trigram spans the boundary
    between two sentences. Sentences shorter than three words contribute
    nothing.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        ``defaultdict`` mapping each ``(w1, w2)`` context to a
        ``defaultdict(float)`` of ``{w3: probability}``, where the
        probabilities of one context sum to 1. Because both levels are
        defaultdicts, indexing a missing context or word inserts an empty
        entry / ``0.0``; use ``in`` or ``.get`` to probe without mutating.
    """
    counts = defaultdict(lambda: defaultdict(int))
    for sentence in corpus:
        words = sentence.lower().split()
        # Slide a window of three over this sentence only.
        for w1, w2, w3 in zip(words, words[1:], words[2:]):
            counts[(w1, w2)][w3] += 1

    model = defaultdict(lambda: defaultdict(float))
    for context, followers in counts.items():
        context_total = sum(followers.values())
        for w3, count in followers.items():
            model[context][w3] = count / context_total
    return model

# Example: three-sentence toy corpus; prints, for each two-word context,
# the estimated probability of every word observed directly after it.
corpus = [
    "I love natural language processing",
    "Language models are fun to build",
    "I love building language models"
]

print(trigram_model(corpus))

defaultdict(<function trigram_model.<locals>.<lambda> at 0x7d4a64482a20>, {('i', 'love'): defaultdict(<class 'float'>, {'natural': 0.5, 'building': 0.5}), ('love', 'natural'): defaultdict(<class 'float'>, {'language': 1.0}), ('natural', 'language'): defaultdict(<class 'float'>, {'processing': 1.0}), ('language', 'processing'): defaultdict(<class 'float'>, {'language': 1.0}), ('processing', 'language'): defaultdict(<class 'float'>, {'models': 1.0}), ('language', 'models'): defaultdict(<class 'float'>, {'are': 1.0}), ('models', 'are'): defaultdict(<class 'float'>, {'fun': 1.0}), ('are', 'fun'): defaultdict(<class 'float'>, {'to': 1.0}), ('fun', 'to'): defaultdict(<class 'float'>, {'build': 1.0}), ('to', 'build'): defaultdict(<class 'float'>, {'i': 1.0}), ('build', 'i'): defaultdict(<class 'float'>, {'love': 1.0}), ('love', 'building'): defaultdict(<class 'float'>, {'language': 1.0}), ('building', 'language'): defaultdict(<class 'float'>, {'models': 1.0})})


In [None]:
import nltk

# Fetch the tokenizer data packages used below. Re-running is harmless:
# NLTK reports a package as already up-to-date instead of fetching it again.
nltk.download('punkt')
nltk.download('punkt_tab')

# With the data in place, sent_tokenize and word_tokenize can be used.
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text containing sentence-ending punctuation and contractions
text = "Hello! I'm learning Natural Language Processing. It's fun."

# Split the text into a list of sentence strings
sentences = sent_tokenize(text)
print("Sentences:", sentences)

# Split the text into word-level tokens; punctuation marks and clitics such
# as "'m" and "'s" come out as separate tokens.
words = word_tokenize(text)
print("Words:", words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Sentences: ['Hello!', "I'm learning Natural Language Processing.", "It's fun."]
Words: ['Hello', '!', 'I', "'m", 'learning', 'Natural', 'Language', 'Processing', '.', 'It', "'s", 'fun', '.']
