In [None]:
import pandas as pd

# Load the parquet file
df = pd.read_parquet("tokenized_gujarati_sentences.parquet")

# Print schema / column names
print("Columns in dataset:", df.columns.tolist())

# Print the first few rows (the label below promises a preview, so show one)
print("\nHead of dataset:")
print(df.head())

# Report the dataset size under its own label
print("\nNumber of rows:", len(df))

# Show the first value of every column together with its Python type,
# which reveals whether a column already holds tokenized data.
for col in df.columns:
    print(f"\nColumn: {col}")
    if df.empty:
        # iloc[0] would raise IndexError on an empty frame
        print("(no rows)")
        continue
    first_value = df[col].iloc[0]
    print(first_value)
    print("Type:", type(first_value))


Columns in dataset: ['sentence']

Head of dataset:
14811

Column: sentence
આ વીડિયો જુઓ : ઊંઝા માર્કેટયાર્ડ આજથી 25 જુલાઈ સુધી બંધ
Type: <class 'str'>


In [None]:
import csv
import pandas as pd
from collections import Counter, defaultdict

# Read the corpus and pull the raw sentence strings out of the frame
df = pd.read_parquet("tokenized_gujarati_sentences.parquet")
sentences = df["sentence"].tolist()

# --- Step 1: Tokenization ---
# Baseline tokenizer: split on runs of whitespace. str.split() with no
# separator already ignores leading/trailing whitespace.
tokenized_sentences = [text.split() for text in sentences]

# --- Step 2: Function to build n-gram models ---
def build_ngram_model(sentences, n):
    """Estimate maximum-likelihood n-gram probabilities from tokenized sentences.

    Every sentence is padded with ``n - 1`` leading ``"<s>"`` markers and a
    single trailing ``"</s>"`` marker before its n-grams are extracted.

    Args:
        sentences: Iterable of token sequences, one per sentence.
        n: Order of the model; must be >= 1.

    Returns:
        A ``defaultdict(dict)`` mapping each context tuple (the first
        ``n - 1`` tokens of an n-gram) to ``{word: P(word | context)}``.
        For ``n == 1`` the only context is the empty tuple ``()``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    ngram_counts = Counter()
    for tokens in sentences:
        # list(...) lets any token sequence (list, tuple, ...) be padded
        padded = ["<s>"] * (n - 1) + list(tokens) + ["</s>"]
        ngram_counts.update(
            tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)
        )

    # A context's count is the total count of the n-grams that start with it.
    # For n == 1 every n-gram has the empty context, so this is the token total.
    context_counts = Counter()
    for ngram, count in ngram_counts.items():
        context_counts[ngram[:-1]] += count

    # Convert counts to conditional probabilities P(word | context)
    model = defaultdict(dict)
    for ngram, count in ngram_counts.items():
        context, word = ngram[:-1], ngram[-1]
        model[context][word] = count / context_counts[context]

    return model


# --- Step 3: Build models ---
# One model per order, 1 (unigram) through 4 (quadrigram)
unigram_model, bigram_model, trigram_model, quadrigram_model = [
    build_ngram_model(tokenized_sentences, order) for order in (1, 2, 3, 4)
]

# --- Step 4: Save models to CSV ---

# Function to save n-gram model into CSV
def save_ngram_to_csv(model, ngram_type, filename):
    """Dump an n-gram model to a UTF-8 CSV file, one row per (context, word).

    Args:
        model: Mapping of context tuple -> {word: probability}.
        ngram_type: Label written into the first column of every row.
        filename: Path of the CSV file to create (overwritten if present).

    The file starts with the header ``Ngram_Type, Context, Word, Probability``;
    the context tokens are joined with single spaces.
    """
    header = ["Ngram_Type", "Context", "Word", "Probability"]
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(header)
        csv_writer.writerows(
            [ngram_type, " ".join(ctx), token, probability]
            for ctx, followers in model.items()
            for token, probability in followers.items()
        )


# Write each model to its own CSV file
for _model, _label, _path in (
    (unigram_model, "Unigram", "unigram_probs.csv"),
    (bigram_model, "Bigram", "bigram_probs.csv"),
    (trigram_model, "Trigram", "trigram_probs.csv"),
    (quadrigram_model, "Quadrigram", "quadrigram_probs.csv"),
):
    save_ngram_to_csv(_model, _label, _path)

print("✅ All n-gram probabilities saved into CSV files successfully!")



✅ All n-gram probabilities saved into CSV files successfully!


In [None]:
import pandas as pd
import random
from collections import Counter
import math

# Seed the global RNG so the random sample below is the same on every run
random.seed(42)

# -------------------------------
# Load dataset
# -------------------------------
file_path = "/content/tokenized_gujarati_sentences.parquet"
df = pd.read_parquet(file_path)

# -------------------------------
# Make sure a "tokens" column exists (one list of tokens per sentence)
# -------------------------------
if "tokens" not in df.columns:
    # Whitespace tokenization; str.split() with no separator never yields
    # empty tokens, so no extra filtering is required.
    df["tokens"] = df["sentence"].astype(str).apply(str.split)

# One token list per sentence
sentences = df["tokens"].tolist()

# -------------------------------
# Draw up to 1000 sentences at random (all of them if the corpus is smaller)
# -------------------------------
sample_sentences = random.sample(sentences, min(1000, len(sentences)))

# -------------------------------
# Build bigram (and unigram) counts using the FULL dataset
# -------------------------------
def build_bigram_counts(sentences_list):
    """Count unigrams and bigrams over sentences padded with boundary markers.

    Each sentence is wrapped as ``<s> ... </s>`` before counting, so both
    markers appear in the unigram counts and in the bigrams at the edges.

    Args:
        sentences_list: Iterable of sentences, each an iterable of tokens
            (a list, a tuple, or any other iterable, not only lists).

    Returns:
        ``(unigram_counts, bigram_counts)``: two ``Counter`` objects keyed by
        token and by ``(token, next_token)`` respectively.
    """
    unigram_counts = Counter()
    bigram_counts = Counter()
    for sent in sentences_list:
        # Unpacking accepts any iterable, unlike list concatenation
        tokens = ["<s>", *sent, "</s>"]
        unigram_counts.update(tokens)
        bigram_counts.update(zip(tokens, tokens[1:]))
    return unigram_counts, bigram_counts

# Counts come from the full dataset, not just the sampled sentences
unigram_counts, bigram_counts = build_bigram_counts(sentences)

# V: number of distinct token types (boundary markers included)
V = len(unigram_counts)
# Total number of counted token occurrences
total_tokens = sum(unigram_counts.values())

print(f"Vocabulary size (V): {V}")
print(f"Total tokens: {total_tokens}")

# -------------------------------
# Smoothing Functions
# -------------------------------
def add_one_prob(w1, w2):
    """Laplace (add-one) smoothed probability of ``w2`` following ``w1``.

    Uses the module-level ``bigram_counts``, ``unigram_counts`` and ``V``:
    ``(count(w1, w2) + 1) / (count(w1) + V)``.
    """
    pair_count = bigram_counts[(w1, w2)]
    context_count = unigram_counts[w1]
    return (pair_count + 1) / (context_count + V)

def add_k_prob(w1, w2, k=0.5):
    """Add-k smoothed probability of ``w2`` following ``w1``.

    Uses the module-level ``bigram_counts``, ``unigram_counts`` and ``V``:
    ``(count(w1, w2) + k) / (count(w1) + k * V)``.
    """
    pair_count = bigram_counts[(w1, w2)]
    context_count = unigram_counts[w1]
    return (pair_count + k) / (context_count + k * V)

def add_type_prob(w1, w2):
    """Token-type smoothing variant for ``w2`` following ``w1``.

    Like add-one, but the denominator adds the number of distinct bigram
    types (``len(bigram_counts)``) instead of the vocabulary size, so the
    resulting values may not sum to 1 over ``w2``.
    """
    pair_count = bigram_counts[(w1, w2)]
    context_count = unigram_counts[w1]
    bigram_types = len(bigram_counts)
    return (pair_count + 1) / (context_count + bigram_types)

# -------------------------------
# Sentence probability (log-sum for numeric stability)
# -------------------------------
def sentence_prob(sentence_tokens, method="add_one", k=0.5, tiny=1e-12):
    """Score a tokenized sentence with a smoothed bigram model.

    Args:
        sentence_tokens: List of tokens (without boundary markers).
        method: ``"add_one"``, ``"add_k"`` or ``"add_type"``.
        k: Smoothing constant, used only by ``"add_k"``.
        tiny: Replacement for any non-positive bigram probability so the
            logarithm stays defined.

    Returns:
        ``(log_prob, prob)``: the summed natural-log probability and its
        exponential; ``prob`` is reported as 0.0 once ``log_prob`` is not
        above -700.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    padded = ["<s>"] + sentence_tokens + ["</s>"]

    # Pick the scoring function once; the padded sentence always has at
    # least one bigram, so an unknown method is always reported.
    if method == "add_one":
        score = add_one_prob
    elif method == "add_k":
        def score(prev, cur):
            return add_k_prob(prev, cur, k)
    elif method == "add_type":
        score = add_type_prob
    else:
        raise ValueError("Unknown smoothing method: " + str(method))

    total = 0.0
    for prev, cur in zip(padded, padded[1:]):
        p = score(prev, cur)
        if p <= 0:
            # guard against log(0)
            p = tiny
        total += math.log(p)

    if total > -700:
        return total, math.exp(total)
    return total, 0.0

# -------------------------------
# Example: score the first 100 sampled sentences with every smoothing method
# -------------------------------
for sent in sample_sentences[:100]:
    sent_str = " ".join(sent)
    # Compute all three scores before printing anything for this sentence
    scored = [
        (" Add-One: log-prob =", sentence_prob(sent, method="add_one")),
        (" Add-K  : log-prob =", sentence_prob(sent, method="add_k", k=0.5)),
        (" Add-Type: log-prob =", sentence_prob(sent, method="add_type")),
    ]
    print("Sentence:", sent_str)
    for _label, (_log_p, _p) in scored:
        print(_label, _log_p, ", prob =", _p)
    print("-" * 60)

# -------------------------------
# (Optional) Average log-prob or average prob over all sampled sentences
# -------------------------------
def avg_logprob_over_samples(samples, method="add_one", k=0.5):
    """Mean sentence log-probability over ``samples``.

    Args:
        samples: Sized collection of tokenized sentences.
        method: Smoothing method name forwarded to ``sentence_prob``.
        k: Smoothing constant forwarded to ``sentence_prob``.

    Returns:
        The sum of the per-sentence log-probabilities divided by
        ``len(samples)``. An empty ``samples`` raises ``ZeroDivisionError``.
    """
    running_total = 0.0
    for tokens in samples:
        running_total += sentence_prob(tokens, method=method, k=k)[0]
    return running_total / len(samples)

# Report the mean log-probability of the sample under each smoothing method
for _title, _method in (
    ("Average log-prob (Add-One) over sample:", "add_one"),
    ("Average log-prob (Add-K k=0.5) over sample:", "add_k"),
    ("Average log-prob (Add-Type) over sample:", "add_type"),
):
    print(_title, avg_logprob_over_samples(sample_sentences, _method, k=0.5))


Vocabulary size (V): 40240
Total tokens: 258501
Sentence: જોબ કૌભાંડને લઈ હવે કૃષિ ભવનમાં પણ ફેક ઈન્ટરવ્યું !
 Add-One: log-prob = -104.8519590444298 , prob = 2.906516349506758e-46
 Add-K  : log-prob = -100.2980604562403 , prob = 2.7612504410973723e-44
 Add-Type: log-prob = -119.32316112441916 , prob = 1.5087231714070694e-52
------------------------------------------------------------
Sentence: આ દિવસ એ ગુરુ સ્મૃતિનો દિવસ છે .
 Add-One: log-prob = -64.84997429051965 , prob = 6.855103465307023e-29
 Add-K  : log-prob = -60.69384966860029 , prob = 4.3751807893637616e-27
 Add-Type: log-prob = -76.27522629975766 , prob = 7.483247548781064e-34
------------------------------------------------------------
Sentence: ભાજપે ૩ ટિકિટ માગણીદારો તથા એક MLAને પ્રવક્તા નીમતા ભારે આશ્ચર્ય
 Add-One: log-prob = -118.86466095062664 , prob = 2.3863473347504386e-52
 Add-K  : log-prob = -114.17397842898801 , prob = 2.5993884887097656e-50
 Add-Type: log-prob = -134.68705798152365 , prob = 3.2074059819107316e-5

In [None]:
# Q3: Smoothed Language Models on 1000 sentences

import pandas as pd
import random
import math
from collections import Counter, defaultdict

# ------------------------------
# Step 1: Load dataset
# ------------------------------
file_path = "/content/tokenized_gujarati_sentences.parquet"
df = pd.read_parquet(file_path)

# Sentences are read from the "sentence" column; missing values are dropped
sentences = df["sentence"].dropna().tolist()

# ------------------------------
# Step 2: Randomly sample up to 1000 sentences
# ------------------------------
# NOTE: no seed is set in this cell, so the draw depends on the current
# state of the global `random` generator.
sampled_sentences = random.sample(sentences, min(1000, len(sentences)))

print(f"Total sampled sentences: {len(sampled_sentences)}")

# ------------------------------
# Step 3: Tokenization (whitespace split, wrapped in boundary markers)
# ------------------------------
tokenized_sentences = [["<s>", *s.split(), "</s>"] for s in sampled_sentences]

# ------------------------------
# Step 4: Build unigram & bigram counts from the sampled sentences
# ------------------------------
unigram_counts = Counter()
bigram_counts = Counter()

for sent in tokenized_sentences:
    unigram_counts.update(sent)
    # adjacent token pairs
    bigram_counts.update(zip(sent, sent[1:]))

V = len(unigram_counts)   # number of distinct token types
print("Vocabulary size:", V)

# ------------------------------
# Step 5: Laplace Smoothed Bigram Probability
# ------------------------------
def bigram_prob(w1, w2):
    """Laplace-smoothed probability of ``w2`` following ``w1``.

    Computed from the module-level counts as
    ``(count(w1, w2) + 1) / (count(w1) + V)``.
    """
    pair_count = bigram_counts[(w1, w2)]
    context_count = unigram_counts[w1]
    return (pair_count + 1) / (context_count + V)

# ------------------------------
# Step 6: Compute Sentence Probability
# ------------------------------
def sentence_probability(sentence):
    """Probability of a raw sentence string under the smoothed bigram model.

    The sentence is split on whitespace and wrapped in ``<s>`` / ``</s>``.
    Bigram log-probabilities are summed and the total is exponentiated, so
    the result is the plain probability rather than its logarithm.
    """
    words = ["<s>", *sentence.split(), "</s>"]
    log_total = 0.0
    for prev, cur in zip(words, words[1:]):
        log_total += math.log(bigram_prob(prev, cur))
    return math.exp(log_total)

# ------------------------------
# Step 7: Score every sampled sentence, preview a few, save all to CSV
# ------------------------------
results = [(s, sentence_probability(s)) for s in sampled_sentences]

# Show up to the first 10 as a sanity check. Slicing (rather than
# range(10)) avoids an IndexError when fewer than 10 sentences were sampled.
for sentence_text, probability in results[:10]:
    print(f"Sentence: {sentence_text}")
    print(f"Probability: {probability:.10e}\n")

# Save all results to CSV
pd.DataFrame(results, columns=["sentence", "probability"]).to_csv("sentence_probabilities.csv", index=False)
# Report the real number of rows written instead of assuming 1000
print(f"✅ Saved all {len(results)} sentence probabilities to sentence_probabilities.csv")



Total sampled sentences: 1000
Vocabulary size: 6278
Sentence: બોટાદના હડદડ ગામે આવેલ પ્રાથમિક શાળામાં વારંવારની નેતાઓ તથા ધારાસભ્યો પાસે માંગણી કરવા છતાં ધોરણ 9 10 મંજૂરી મળી નથી રહી .
Probability: 2.3716557078e-78

Sentence: નવરાત્રી નજીક આવતાની સાથે જ કુંભાર સવારે વહેલા ઉઠી ચાકડા ઉપર માટીનો પીંડો ચડાવી ગરબા બનાવવા લાગી જાય છે .
Probability: 6.4402904172e-68

Sentence: જો પૃષ્ઠભૂમિની તપાસ તમારા ભૂતકાળમાં વર્તણૂકો છતી કરે છે જે છેવટે અયોગ્યતાના અર્થમાં હશે , તો તમારું પહેલું પગલું એ શોધવાનું છે કે તે શું છે જે તમને ભાડે લેવાથી રાખવામાં આવે છે .
Probability: 4.2076058187e-111

Sentence: ગ્રાહક કલ્યાણ ભંડોળમાં 16.58 કરોડ રૂપિયા જમા
Probability: 1.8241338270e-28

Sentence: કોર્ટે મંગળવારે સુષ્મિતાને ચૂંટણી પંચના નિર્ણયનો રેકોર્ડ દાખલ કરવાની મંજૂરી આપી હતી .
Probability: 7.0518342874e-45

Sentence: હવે પૂરી બનાવવા માટે મેંદાના લોટને એક બાઉલમાં લઇ તેમાં ઘઉંનો લોટ , નમક અને તેલ ઉમેરી મિક્ષ કરી લો .
Probability: 9.8898727277e-74

Sentence: આ રાશિના લોકો પર માં કાળીની કૃપા દૃષ્ટિ બની છે , જેના