<a href="https://colab.research.google.com/github/Vayshu08/NLP/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import defaultdict, Counter
import random

def train_bigram_model(corpus):
    """Estimate conditional next-word probabilities from a whitespace-tokenized corpus.

    Args:
        corpus: Text to learn from; it is split on whitespace with ``str.split()``.

    Returns:
        A ``defaultdict(dict)`` mapping each word that is followed by another
        word to ``{next_word: probability}``, where the probability is the
        number of times ``next_word`` followed the word divided by the number
        of times anything followed it. A corpus with fewer than two tokens
        yields an empty mapping.
    """
    words = corpus.split()

    # Tally, for every word, how often each successor appears right after it.
    successor_counts = defaultdict(Counter)
    for left, right in zip(words, words[1:]):
        successor_counts[left][right] += 1

    # Normalize each word's successor tallies so they sum to 1.
    bigram_model = defaultdict(dict)
    for left, followers in successor_counts.items():
        denominator = sum(followers.values())
        bigram_model[left] = {
            right: seen / denominator for right, seen in followers.items()
        }

    return bigram_model

def generate_text(bigram_model, seed_word, max_length=20):
    """Sample a word sequence from a bigram model, starting at ``seed_word``.

    Args:
        bigram_model: Mapping of word -> ``{next_word: probability}``.
        seed_word: First word of the output; it is always included.
        max_length: Maximum number of words appended after the seed.

    Returns:
        The seed word and the sampled words joined by single spaces.
        Generation stops early when the latest word has no entry in
        ``bigram_model``.
    """
    words = [seed_word]

    for _ in range(max_length):
        latest = words[-1]
        if latest not in bigram_model:
            # Dead end: nothing was ever observed after this word.
            break

        successors = bigram_model[latest]
        # Draw one successor, weighted by its conditional probability.
        drawn = random.choices(list(successors), weights=list(successors.values()))
        words.append(drawn[0])

    return ' '.join(words)

# Example usage: fit the bigram model on a one-sentence toy corpus.
corpus = "this is a simple example of a corpus for bigram model implementation"
bigram_model = train_bigram_model(corpus)

# Print the trained bigram model, one line per (previous word, next word) pair,
# grouped by the previous word.
print("Bigram Model:")
for prev_word, next_words in bigram_model.items():
    for next_word, prob in next_words.items():
        print(f'P({next_word} | {prev_word}) = {prob:.4f}')

# Generate text using the bigram model.
# generate_text samples with random.choices and no seed is set in this cell,
# so the result can differ between runs wherever a word has several successors.
seed_word = "this"
generated_text = generate_text(bigram_model, seed_word)
print(f"\nGenerated Text starting with '{seed_word}':\n{generated_text}")


Bigram Model:
P(is | this) = 1.0000
P(a | is) = 1.0000
P(simple | a) = 0.5000
P(corpus | a) = 0.5000
P(example | simple) = 1.0000
P(of | example) = 1.0000
P(a | of) = 1.0000
P(for | corpus) = 1.0000
P(bigram | for) = 1.0000
P(model | bigram) = 1.0000
P(implementation | model) = 1.0000

Generated Text starting with 'this':
this is a corpus for bigram model implementation


In [None]:
from collections import defaultdict, Counter

def compute_bigram_probabilities(corpus):
    """Compute conditional bigram probabilities P(next_word | prev_word).

    The corpus is split on whitespace and every adjacent word pair is counted.
    Each pair's count is divided by the number of bigrams that start with the
    same first word, so the probabilities stored under one ``prev_word`` sum
    to 1.

    Args:
        corpus: Text to analyse; tokenized with ``str.split()``.

    Returns:
        A ``defaultdict(dict)`` mapping ``prev_word`` to
        ``{next_word: P(next_word | prev_word)}``. A corpus with fewer than
        two tokens yields an empty mapping.
    """
    # Tokenize the corpus into words
    tokens = corpus.split()

    # Count occurrences of each bigram (word_i, word_i+1)
    bigram_counts = Counter(zip(tokens, tokens[1:]))

    # Denominator for the conditional probability: how many bigrams start with
    # each word. The last token starts no bigram, so it is excluded.
    context_counts = Counter(tokens[:-1])

    # Normalize per preceding word rather than by the total bigram count;
    # dividing by the total would give a joint frequency, not P(next | prev).
    bigram_probabilities = defaultdict(dict)
    for (prev_word, next_word), count in bigram_counts.items():
        bigram_probabilities[prev_word][next_word] = count / context_counts[prev_word]

    return bigram_probabilities

# Example usage: compute bigram probabilities for a one-sentence toy corpus.
corpus = "this is a simple example of a corpus for bigram probabilities computation"
bigram_probabilities = compute_bigram_probabilities(corpus)

# Print the computed bigram probabilities, one line per stored
# (previous word, next word) pair, grouped by the previous word.
print("Bigram Probabilities:")
for prev_word, next_words in bigram_probabilities.items():
    for next_word, prob in next_words.items():
        print(f'P({next_word} | {prev_word}) = {prob:.4f}')


Bigram Probabilities:
P(is | this) = 0.0909
P(a | is) = 0.0909
P(simple | a) = 0.0909
P(corpus | a) = 0.0909
P(example | simple) = 0.0909
P(of | example) = 0.0909
P(a | of) = 0.0909
P(for | corpus) = 0.0909
P(bigram | for) = 0.0909
P(probabilities | bigram) = 0.0909
P(computation | probabilities) = 0.0909


In [None]:
from collections import Counter

def train_unigram_model(corpus):
    """Build a unigram model: each word's relative frequency in the corpus.

    Args:
        corpus: Text to learn from; tokenized with ``str.split()``.

    Returns:
        A plain ``dict`` mapping each distinct word to its occurrence count
        divided by the total number of tokens. An empty corpus yields ``{}``.
    """
    words = corpus.split()
    n_words = len(words)

    unigram_model = {}
    # Counter preserves first-seen order, so the model lists words in the
    # order they first appear in the corpus.
    for word, occurrences in Counter(words).items():
        unigram_model[word] = occurrences / n_words

    return unigram_model

def get_word_probability(unigram_model, word):
    """Look up the probability of ``word`` in a unigram model.

    Args:
        unigram_model: Mapping of word -> probability.
        word: The word to look up.

    Returns:
        The stored probability, or ``0`` when the word is not in the model.
    """
    if word in unigram_model:
        return unigram_model[word]
    # Unseen words get zero probability (no smoothing is applied).
    return 0

# Example usage: fit the unigram model on a one-sentence toy corpus.
corpus = "this is a simple example of a corpus for unigram model"
unigram_model = train_unigram_model(corpus)

# Print the trained unigram model, one line per distinct word.
print("Unigram Model:")
for word, prob in unigram_model.items():
    print(f'P({word}) = {prob:.4f}')

# Calculate probability of a specific word
# (get_word_probability returns 0 for a word that is not in the model).
word_to_check = "example"
prob_example = get_word_probability(unigram_model, word_to_check)
print(f"\nProbability of '{word_to_check}': {prob_example:.4f}")


Unigram Model:
P(this) = 0.0909
P(is) = 0.0909
P(a) = 0.1818
P(simple) = 0.0909
P(example) = 0.0909
P(of) = 0.0909
P(corpus) = 0.0909
P(for) = 0.0909
P(unigram) = 0.0909
P(model) = 0.0909

Probability of 'example': 0.0909


In [None]:
def generate_trigrams(text):
    """Index every trigram in ``text`` by its first two words.

    Args:
        text: Input text; tokenized with ``str.split()``.

    Returns:
        A ``dict`` mapping each ``(word_i, word_i+1)`` pair to the list of
        words that followed that pair, in order of occurrence and with
        repeats kept. Text with fewer than three tokens yields ``{}``.
    """
    tokens = text.split()
    trigrams = {}

    # Slide a three-word window over the tokens.
    for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
        trigrams.setdefault((first, second), []).append(third)

    return trigrams

def generate_text(trigrams, length=50):
    """Deterministically extend the first trigram context into a word sequence.

    Generation starts from the first key of ``trigrams`` and repeatedly
    appends the first recorded continuation of the last two words.

    Args:
        trigrams: Mapping of ``(word_a, word_b)`` -> list of following words,
            as produced by ``generate_trigrams``.
        length: Generation stops once the output holds this many words. The
            two-word starting context is always emitted, so the result has at
            least two words whenever ``trigrams`` is non-empty.

    Returns:
        The generated words joined by single spaces, or ``''`` when
        ``trigrams`` is empty (for example, when the source text had fewer
        than three words).
    """
    if not trigrams:
        # Nothing to start from; indexing the first key would raise IndexError.
        return ''

    # Start with the first context in insertion order, without building a
    # throwaway list of every key.
    text = list(next(iter(trigrams)))

    while len(text) < length:
        # .get keeps the lookup read-only even if a defaultdict is passed in.
        continuations = trigrams.get(tuple(text[-2:]))
        if not continuations:
            # Unknown context, or one with no recorded continuation: stop.
            break
        # Always take the first recorded continuation, so output is deterministic.
        text.append(continuations[0])

    return ' '.join(text)

# Example usage: build the trigram table from a two-sentence sample and
# regenerate text from it.
# Note: generate_text here is the trigram version defined in this cell; it
# reuses the name of the bigram generate_text defined in an earlier cell.
if __name__ == '__main__':
    text = "A trigram is a sequence of three consecutive words in a text. This is an example of a trigram generator."
    trigrams = generate_trigrams(text)
    generated_text = generate_text(trigrams)
    print(generated_text)


A trigram is a sequence of three consecutive words in a text. This is an example of a trigram generator.


In [None]:
from collections import defaultdict
import random

class WordPredictor:
    """Next-word predictor backed by a table of n-word contexts.

    ``train`` records, for every run of ``n_gram`` consecutive words, the word
    that followed it. ``predict_next_word`` looks up the last ``n_gram`` words
    of its input and picks one of the recorded followers at random.
    """

    def __init__(self, n_gram=2):
        """Create an untrained predictor.

        Args:
            n_gram: Number of context words used to predict the next word.
                ``0`` means no context: every trained word is a candidate.

        Raises:
            ValueError: If ``n_gram`` is negative.
        """
        if n_gram < 0:
            raise ValueError("n_gram must be non-negative")
        self.n_gram = n_gram
        # context tuple -> list of words seen right after it (repeats kept, so
        # random.choice samples followers in proportion to their frequency)
        self.n_grams = defaultdict(list)

    def train(self, text):
        """Add every (context, next word) pair found in ``text`` to the table.

        Args:
            text: Training text; tokenized with ``str.split()``. Calling
                ``train`` again adds to the existing table.
        """
        words = text.split()
        for i in range(len(words) - self.n_gram):
            context = tuple(words[i:i + self.n_gram])
            self.n_grams[context].append(words[i + self.n_gram])

    def predict_next_word(self, text):
        """Predict the word following ``text``.

        Args:
            text: Input text; only its last ``n_gram`` words are used.

        Returns:
            A randomly chosen follower of the context,
            ``"Not enough words to predict"`` when ``text`` has fewer than
            ``n_gram`` words, or ``"No prediction available"`` when the
            context was never seen in training.
        """
        words = text.split()

        if len(words) < self.n_gram:
            return "Not enough words to predict"

        # Slice from an explicit start index: words[-self.n_gram:] would be
        # words[-0:] (the whole list) when n_gram is 0.
        context = tuple(words[len(words) - self.n_gram:])

        # .get keeps the lookup from inserting an empty entry into the defaultdict.
        candidates = self.n_grams.get(context)
        if candidates:
            return random.choice(candidates)
        return "No prediction available"

# Example usage: train a predictor with two-word contexts on one sentence and
# ask it for the word that follows a three-word prompt.
if __name__ == '__main__':
    predictor = WordPredictor(n_gram=2)

    # Example text for training
    text = "Natural language processing is a subfield of artificial intelligence."
    predictor.train(text)

    # Example of predicting the next word
    # (only the last n_gram words of input_text, "language processing", are
    # used for the lookup).
    input_text = "Natural language processing"
    predicted_word = predictor.predict_next_word(input_text)
    print(f"Input text: '{input_text}'")
    print(f"Predicted next word: {predicted_word}")


Input text: 'Natural language processing'
Predicted next word: is
