### Preparing the data for calculations

In [None]:
pip install requests



In [None]:
import requests
import pandas as pd
import numpy as np
import nltk
import re
import math
from collections import defaultdict
from nltk.tokenize import wordpunct_tokenize
from nltk.lm import MLE, Laplace, StupidBackoff
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
url = "https://raw.githubusercontent.com/Rubina15Parveen/NgramsLanguageModel/59c2ee7e5abf9606ed8ba7de92079a5bf49c3662/corpus_for_language_model.txt"

In [None]:
response = requests.get(url)
if response.status_code == 200:
  corpus = response.text.splitlines()
  print(corpus[0])

Richard W. Lock , retired vice president and treasurer of Owens-Illinois Inc. , was named a director of this transportation industry supplier , increasing its board to six members . 


In [None]:
tokenized_sentences = [wordpunct_tokenize(sentence) for sentence in corpus]
#tokenized_sentences[0]

### 1.A : Maximum Likelihood Estimation using the trigram model

In [None]:
n = 3
train_data, vocab = padded_everygram_pipeline(n, tokenized_sentences)
model1 = MLE(n)
model1.fit(train_data, vocab)

In [None]:
S1 = "Senator Byrd is chairman of the Appropriations Committee ."
S2 = "His appointment expands the board to 13 members ."
S1_tokens = list(pad_both_ends(wordpunct_tokenize(S1), n))
S2_tokens = list(pad_both_ends(wordpunct_tokenize(S2), n))
print(S1_tokens)
print(S2_tokens)

['<s>', '<s>', 'Senator', 'Byrd', 'is', 'chairman', 'of', 'the', 'Appropriations', 'Committee', '.', '</s>', '</s>']
['<s>', '<s>', 'His', 'appointment', 'expands', 'the', 'board', 'to', '13', 'members', '.', '</s>', '</s>']


### Probability using the MLE model rounded of to nearest integer :

In [None]:
def log_probabilities(model, sentence):
    log_prob = 0.0
    for i in range(len(sentence) - n + 1):
        ngram = tuple(sentence[i:i+n])
        log_prob += model.logscore(ngram[-1], ngram[:-1])
    return log_prob

log_prob_S1 = log_probabilities(model1, S1_tokens)
log_prob_S2 = log_probabilities(model1, S2_tokens)
print(f"Sentence 1 Probability : {log_prob_S1}".format(log_prob_S1))
print(f"Sentence 2 Probability : {log_prob_S2}".format(log_prob_S2))

Sentence 1 Probability : -19.446068605660464
Sentence 2 Probability : -18.383636799547986


### 1B: Adding one Laplace Smoothening

In [None]:

# Define n-gram order (trigrams) and prepare training data
tokenized_sentences = [wordpunct_tokenize(sentence) for sentence in corpus]
n = 3
train_data, vocab = padded_everygram_pipeline(n, tokenized_sentences)

# Initialize and fit the Laplace model (Add-One Smoothing)
model2 = Laplace(n)
model2.fit(train_data, vocab)

# Calculate log probabilities using Laplace smoothing
log_prob_S1_Laplace = log_probabilities(model2, S1_tokens)
log_prob_S2_Laplace = log_probabilities(model2, S2_tokens)

# Output log probabilities (rounded)
print(f"Laplace - Sentence 1 Log Probability: {round(log_prob_S1_Laplace)}")
print(f"Laplace - Sentence 2 Log Probability: {round(log_prob_S2_Laplace)}")


Laplace - Sentence 1 Log Probability: -107
Laplace - Sentence 2 Log Probability: -109


### 1B: With Katz back off smoothing

In [None]:
# Define a function for log probabilities using the StupidBackoff model
tokenized_sentences = [wordpunct_tokenize(sentence) for sentence in corpus]
n = 3
train_data, vocab = padded_everygram_pipeline(n, tokenized_sentences)

# Initialize and fit the StupidBackoff model with the n-gram order (3 in this case)
alpha = 0.4  # Katz back-off alpha
model3 = StupidBackoff(alpha, n)  # Specify the n-gram order

# Fit the model using the same training data and vocabulary
model3.fit(train_data, vocab)

# Ensure proper fallback happens in case trigrams are not found
log_prob_S1_backoff = log_probabilities(model3, S1_tokens)
log_prob_S2_backoff = log_probabilities(model3, S2_tokens)

# Output the results
print(f"Katz Backoff - Sentence 1 Log Probability: {log_prob_S1_backoff}")
print(f"Katz Backoff - Sentence 2 Log Probability: {log_prob_S2_backoff}")


Katz Backoff - Sentence 1 Log Probability: -19.446068605660464
Katz Backoff - Sentence 2 Log Probability: -18.383636799547986


In [None]:
S3 = "But , in this context , that 's the smart thing to do . ''"
S4 = "It has n't made merger overtures to the board . "
S3_tokens = list(pad_both_ends(wordpunct_tokenize(S3), n))
S4_tokens = list(pad_both_ends(wordpunct_tokenize(S4), n))
log_prob_S3 = log_probabilities(model1, S3_tokens)
log_prob_S4 = log_probabilities(model1, S4_tokens)
print(f"Sentence 1 Probability : {log_prob_S3}".format(log_prob_S3))
print(f"Sentence 2 Probability : {log_prob_S4}".format(log_prob_S4))

Sentence 1 Probability : -28.78331761074023
Sentence 2 Probability : -22.507625516823442


In [None]:
S5= "W.R. Grace holds three of Grace Energy 's seven board seats ."
S6= "Mervin Lung remains chairman and chief executive officer ."
S5_tokens = list(pad_both_ends(wordpunct_tokenize(S5), n))
S6_tokens = list(pad_both_ends(wordpunct_tokenize(S6), n))
log_prob_S5 = log_probabilities(model1, S5_tokens)
log_prob_S6 = log_probabilities(model1, S6_tokens)
print(f"Sentence 5 Probability : {log_prob_S5}".format(log_prob_S5))
print(f"Sentence 6 Probability : {log_prob_S6}".format(log_prob_S6))

Sentence 5 Probability : -27.046352016574026
Sentence 6 Probability : -15.639639105271593


### Question 2: Compute Positive Pointwise Mutual Information of pairs [word, context]

In [None]:
def calculate_ppmi(tokenized_sentences):
    # Calculate probabilities
    word_freq = defaultdict(int)
    context_freq = defaultdict(int)
    pair_freq = defaultdict(int)
    total_count = 0

    #Process each sentence
    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):
            # Get the context window 8
            left_context = sentence[max(0, i-8):i]
            right_context = sentence[i+1:i+9]
            context = ['NIL'] * (8 - len(left_context)) + left_context + right_context + ['NIL'] * (8 - len(right_context))

            # Count the frequencies
            word_freq[word] += 1
            for c_word in context:
                context_freq[c_word] += 1
                pair_freq[(word, c_word)] += 1
                total_count += 1
    return word_freq, context_freq, pair_freq, total_count

def calculate_ppmi_for_pair(target_word, context_word, word_freq, context_freq, pair_freq, total_count):
    #word_freq, context_freq, pair_freq, total_count = calculate_ppmi(tokenized_sentences)
    # Calculate probabilities
    prob_word = word_freq[target_word] / total_count
    prob_context = context_freq[context_word] / total_count
    prob_pair = pair_freq[(target_word, context_word)] / total_count

    # Calculate PMI
    if prob_pair > 0 and prob_word > 0 and prob_context > 0:
        pmi = math.log2(prob_pair / (prob_word * prob_context))
        ppmi = max(0, pmi)
        return ppmi
    else:
        return 0

# Calculate frequencies once
tokenized_sentences = [wordpunct_tokenize(sentence) for sentence in corpus]
word_freq, context_freq, pair_freq, total_count = calculate_ppmi(tokenized_sentences)

### Q2.A : PPMI of 1) executive and president 2) executive and said 3) company and of 4) company and said

In [None]:
#Q2.A.1. word = executive and context word = president
ppmi_value = calculate_ppmi_for_pair('executive', 'president', word_freq, context_freq, pair_freq, total_count)
print(f"PPMI for 'executive' and 'president' = {ppmi_value}")

PPMI for 'executive' and 'president' = 6.543534603619038


In [None]:
#Q2.A.2. word = executive and context word = said
target_word = 'executive'
context_word = 'said'
ppmi_value = calculate_ppmi_for_pair(target_word, context_word, word_freq, context_freq, pair_freq, total_count)
print(f"PPMI for '{target_word}' and '{context_word}' = {ppmi_value}")

PPMI for 'executive' and 'said' = 4.76653639984982


In [None]:
#Q2.A.1. word = company and context word = of
target_word = 'company'
context_word = 'of'
ppmi_value = calculate_ppmi_for_pair(target_word, context_word, word_freq, context_freq, pair_freq, total_count)
print(f"PPMI for '{target_word}' and '{context_word}' = {ppmi_value}")

PPMI for 'company' and 'of' = 4.3032070297415626


In [None]:
#Q2.A.1. word = company and context word = said
target_word = 'company'
context_word = 'said'
ppmi_value = calculate_ppmi_for_pair(target_word, context_word, word_freq, context_freq, pair_freq, total_count)
print(f"PPMI for '{target_word}' and '{context_word}' = {ppmi_value}")

PPMI for 'company' and 'said' = 4.448866737439181


### Q2.B : context words given as [said, of, president]. Calculate Cosine Similarity of words 1)sales and purchase 2) executive and company 3)executive and sales

In [None]:



# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0
    return dot_product / (norm_vec1 * norm_vec2)

# Function to calculate PPMI vectors for a given word and a set of context words
def calculate_ppmi_vectors(words, contexts):
    word_freq, context_freq, pair_freq, total_count = calculate_ppmi(tokenized_sentences)
    # Step 4: Calculate PPMI vectors for each word
    ppmi_vectors = {}
    for word in words:
        ppmi_vector = []
        p_word = word_freq[word] / total_count
        for context_word in contexts:
            p_context = context_freq[context_word] / total_count
            p_pair = pair_freq[(word, context_word)] / total_count

            # Calculate PMI
            if p_pair > 0 and p_word > 0 and p_context > 0:
                pmi = math.log2(p_pair / (p_word * p_context))
                ppmi = max(0, pmi)  # Convert PMI to PPMI
            else:
                ppmi = 0
            ppmi_vector.append(ppmi)
        ppmi_vectors[word] = ppmi_vector

    return ppmi_vectors

# Example usage
words = ['sales', 'purchase', 'executive', 'company']
contexts = ['said', 'of', 'president']

# Calculate PPMI vectors
ppmi_vectors = calculate_ppmi_vectors(words, contexts)

# (1) Cosine similarity for 'sales' and 'purchase'
cosine_sales_purchase = cosine_similarity(ppmi_vectors['sales'], ppmi_vectors['purchase'])
print(f"Cosine similarity (sales, purchase): {round(cosine_sales_purchase, 2)}")

# (2) Cosine similarity for 'executive' and 'company'
cosine_executive_company = cosine_similarity(ppmi_vectors['executive'], ppmi_vectors['company'])
print(f"Cosine similarity (executive, purchase): {round(cosine_executive_company, 2)}")

# (3) Cosine similarity for 'executive' and 'sales'
cosine_executive_sales = cosine_similarity(ppmi_vectors['executive'], ppmi_vectors['sales'])
print(f"Cosine similarity (executive, sales): {round(cosine_executive_sales, 2)}")


Cosine similarity (sales, purchase): 0.96
Cosine similarity (executive, purchase): 0.97
Cosine similarity (executive, sales): 1.0


### Q3: Word Vectors using Glove embedding

In [None]:
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip glove.42B.300d.zip


--2024-09-25 19:00:54--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2024-09-25 19:00:54--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2024-09-25 19:00:54--  https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]


### Q3.A: Find most similar words using glove embedding 1) business 2)school 3) state

In [None]:
import numpy as np

# Load the GloVe vectors into a dictionary
def load_glove_embeddings(glove_file):
    embeddings_dict = {}
    with open(glove_file, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Load the GloVe embeddings
glove_file = 'glove.42B.300d.txt'  # Update with your file path
embeddings_dict = load_glove_embeddings(glove_file)

# Retrieve the word embedding for "state"
business_vector = embeddings_dict["business"]

# Find the most similar word to "state" excluding "state" and "states"
most_similar_word = None
max_similarity = -1

for word, vector in embeddings_dict.items():
    if word in ["business", "businesss"]:
        continue
    similarity = cosine_similarity(business_vector, vector)
    if similarity > max_similarity:
        max_similarity = similarity
        most_similar_word = word

print(f"The most similar word to 'business' is: {most_similar_word}")


The most similar word to 'business' is: businesses


In [None]:
school_vector = embeddings_dict["school"]

# Find the most similar word to "business" excluding "business" and "businesss"
most_similar_word = None
max_similarity = -1

for word, vector in embeddings_dict.items():
    if word in ["school", "schools"]:
        continue
    similarity = cosine_similarity(school_vector, vector)
    if similarity > max_similarity:
        max_similarity = similarity
        most_similar_word = word

print(f"The most similar word to 'school' is: {most_similar_word}")

The most similar word to 'school' is: elementary


In [None]:
state_vector = embeddings_dict["state"]

most_similar_word = None
max_similarity = -1

for word, vector in embeddings_dict.items():
    if word in ["state", "states"]:
        continue
    similarity = cosine_similarity(state_vector, vector)
    if similarity > max_similarity:
        max_similarity = similarity
        most_similar_word = word

print(f"The most similar word to 'state' is: {most_similar_word}")

The most similar word to 'state' is: government


### Q3.B: Find analogies 1)paris is to france as madrid is to ____

In [None]:
paris_vector = embeddings_dict["paris"]
france_vector = embeddings_dict["france"]
madrid_vector = embeddings_dict["madrid"]
analogy_vector = madrid_vector + france_vector - paris_vector
most_similar_word = None
max_similarity = -1

for word, vector in embeddings_dict.items():
    if word in ["paris", "france", "madrid"]:
        continue
    similarity = cosine_similarity(analogy_vector, vector)
    if similarity > max_similarity:
        max_similarity = similarity
        most_similar_word = word

print(f"The word completing the analogy 'Paris is to France as Madrid is to X' is: {most_similar_word}")


The word completing the analogy 'Paris is to France as Madrid is to X' is: spain


In [None]:
import math
from collections import defaultdict, Counter
from nltk.tokenize import wordpunct_tokenize
from nltk.lm import MLE, Laplace, StupidBackoff
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends

def tokenize_sentences(sentences):
    return [wordpunct_tokenize(sentence) for sentence in sentences]

def get_context(tokens, index, window_size=8):
    left_context = tokens[max(0, index - window_size):index]
    right_context = tokens[index + 1:index + window_size + 1]

    left_context = ['NIL'] * (window_size - len(left_context)) + left_context
    right_context = right_context + ['NIL'] * (window_size - len(right_context))

    return left_context + right_context

def compute_co_occurrence(tokenized_sentences, window_size=8):
    co_occurrence = defaultdict(Counter)
    word_counts = Counter()
    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):
            context = get_context(sentence, i, window_size)
            co_occurrence[word].update(context)
            word_counts[word] += 1
    # print(co_occurrence)
    return co_occurrence, word_counts

def compute_ppmi(word, context_word, co_occurrence, word_counts, total_words):
    p_word = word_counts[word] / total_words
    p_context = word_counts[context_word] / total_words
    p_word_context = co_occurrence[word][context_word] / total_words


    if p_word_context == 0:
        return 0

    pmi = math.log2(p_word_context / (p_word * p_context))
    return max(0, pmi)

if __name__ == "__main__":

    tokenized_sentences = tokenize_sentences(corpus)

    co_occurrence, word_counts = compute_co_occurrence(tokenized_sentences)
    total_words = sum(word_counts.values())

    word_context_pairs = [
        ("executive", "president"),
        ("executive", "said"),
        ("company", "said"),
        ("company", "of")
    ]

    for word, context_word in word_context_pairs:
        ppmi = compute_ppmi(word, context_word, co_occurrence, word_counts, total_words)
        print(f"PPMI({word}, {context_word}) = {ppmi:.4f}")

PPMI(executive, president) = 6.3745
PPMI(executive, said) = 4.5779
PPMI(company, said) = 4.2603
PPMI(company, of) = 4.1597
