##Part2

In [8]:
import os.path
import sys
import random
import math
from collections import defaultdict

# Constants
# Special tokens shared by the preprocessing helpers and the language models.
UNK = "UNK"  # replaces rare training words and out-of-vocabulary test words
start = "<s>"  # sentence-start marker inserted at index 0 of every sentence
end = "</s>"  # sentence-end marker appended to every sentence

#----------------------------------------
#  Data input
#----------------------------------------

def readFileToCorpus(f):
    """Read a text file into a corpus of whitespace-tokenised sentences.

    Args:
        f: Path of the file to read; each line is treated as one sentence.

    Returns:
        A list with one entry per line, each entry being the list of
        whitespace-separated tokens on that line (empty list for blank lines).

    If ``f`` is not an existing file, an error message is printed and the
    interpreter exits via ``sys.exit()``.
    """
    # Guard clause: bail out early when the path is not a regular file.
    if not os.path.isfile(f):
        print(f"Error: {f} does not exist")
        sys.exit()

    with open(f, "r") as handle:
        return [line.split() for line in handle]

def preprocess(corpus):
    """Prepare a training corpus in place and return it.

    Three passes are made over ``corpus`` (a list of token lists):
      1. count how often every token occurs,
      2. replace every token seen fewer than two times with ``UNK``,
      3. wrap every sentence in the ``start`` / ``end`` markers.

    The sentence lists are mutated in place; the same ``corpus`` object is
    returned for convenience.
    """
    # Pass 1: token frequencies over the whole corpus.
    occurrences = defaultdict(int)
    for sentence in corpus:
        for token in sentence:
            occurrences[token] += 1

    # Pass 2: singletons become the unknown-word placeholder.
    for sentence in corpus:
        for idx, token in enumerate(sentence):
            if occurrences[token] < 2:
                sentence[idx] = UNK

    # Pass 3: add sentence boundary markers.
    for sentence in corpus:
        sentence.insert(0, start)
        sentence.append(end)

    return corpus

def preprocessTest(vocab, corpus):
    """Prepare a test corpus in place using a fixed vocabulary and return it.

    Args:
        vocab: Container of known tokens (membership is tested with ``in``).
        corpus: List of token lists; mutated in place.

    Every token that is not in ``vocab`` is replaced with ``UNK``; afterwards
    each sentence is wrapped in the ``start`` / ``end`` markers.
    """
    # Map out-of-vocabulary tokens to the unknown-word placeholder.
    for sentence in corpus:
        for idx, token in enumerate(sentence):
            if token not in vocab:
                sentence[idx] = UNK

    # Add sentence boundary markers.
    for sentence in corpus:
        sentence.insert(0, start)
        sentence.append(end)

    return corpus

#----------------------------------------------
# Language Models
#----------------------------------------------

class LanguageModel:
    """Common interface and shared utilities for the n-gram language models.

    Subclasses override ``generateSentence`` and ``getSentenceProbability``;
    the versions defined here are placeholders that return ``None``.
    """

    def __init__(self, corpus):
        """Placeholder constructor; the base class ignores ``corpus``."""

    def generateSentence(self):
        """Placeholder; subclasses return a generated sentence as a token list."""

    def getSentenceProbability(self, sen):
        """Placeholder; subclasses return the probability of token list ``sen``."""

    def getCorpusPerplexity(self, corpus):
        """Return the per-token perplexity of ``corpus`` under this model.

        Args:
            corpus: Iterable of sentences (token lists). The first token of
                each sentence is not counted as a predicted token.

        Returns:
            ``2 ** (-sum(log2 P(sentence)) / predicted_tokens)``, or
            ``float('inf')`` if any sentence has probability 0 or the corpus
            contains no predicted tokens.
        """
        log_sum = 0.0
        total_words = 0
        for sen in corpus:
            prob = self.getSentenceProbability(sen)
            if prob == 0:
                # One impossible sentence makes the whole corpus impossible.
                return float('inf')
            # math.log2 is exact for powers of two, unlike math.log(x, 2),
            # which divides two natural logarithms.
            log_sum += math.log2(prob)
            # Exclude the leading <s>; an empty sentence contributes no
            # tokens rather than a negative count.
            total_words += max(len(sen) - 1, 0)
        if total_words == 0:
            return float('inf')
        return 2 ** (-log_sum / total_words)

    def generateSentencesToFile(self, n, filename):
        """Generate ``n`` sentences and write them to ``filename``.

        Each output line has the form ``"<probability> <space-joined tokens>"``.
        The file is opened in write mode, so existing content is replaced.
        """
        with open(filename, 'w') as f:
            for _ in range(n):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)
                f.write(f"{prob} {' '.join(sen)}\n")

class UnigramModel(LanguageModel):
    """Unsmoothed (maximum-likelihood) unigram language model.

    Attributes:
        counts: token -> number of occurrences in the training corpus
            (the ``start`` marker is never counted).
        total: total number of counted tokens.
        vocab: list of the distinct counted tokens, in first-seen order.
    """

    def __init__(self, corpus):
        """Count every token of ``corpus`` (a list of token lists) except ``start``."""
        self.counts = defaultdict(float)
        self.total = 0.0
        for sen in corpus:
            for w in sen:
                # <s> is never predicted by a unigram model, so it is skipped.
                if w == start:
                    continue
                self.counts[w] += 1
                self.total += 1
        self.vocab = list(self.counts.keys())

    def prob(self, word):
        """Return count(word) / total tokens; 0 for unseen words or an empty model."""
        if self.total <= 0:
            return 0.0
        # .get() avoids inserting a zero entry into the defaultdict on every
        # lookup of an unseen word.
        return self.counts.get(word, 0.0) / self.total

    def generateSentence(self):
        """Sample tokens independently until ``end`` is drawn.

        Returns:
            A token list beginning with ``start`` and finishing with ``end``.
            If ``end`` was never observed in training it could never be
            drawn, so ``[start, end]`` is returned instead of looping forever.
        """
        if self.counts.get(end, 0.0) <= 0:
            return [start, end]

        # The distribution is fixed after training; compute it once.
        weighted_vocab = [(w, self.prob(w)) for w in self.vocab]

        sentence = [start]
        while True:
            # Inverse-CDF sampling: walk the vocabulary subtracting each
            # probability until the uniform draw is used up.
            rand = random.random()
            for w, p in weighted_vocab:
                rand -= p
                if rand <= 0:
                    sentence.append(w)
                    if w == end:
                        return sentence
                    break

    def getSentenceProbability(self, sen):
        """Return the product of unigram probabilities of ``sen[1:]``.

        The first token (expected to be ``start``) is skipped. Returns 0.0 as
        soon as a token with zero count is encountered.
        """
        prob = 1.0
        for w in sen[1:]:
            if self.counts.get(w, 0.0) == 0:
                return 0.0
            prob *= self.prob(w)
        return prob

class SmoothedUnigramModel(LanguageModel):
    """Unigram language model with add-one (Laplace) smoothing.

    Attributes:
        counts: token -> number of occurrences in the training corpus
            (the ``start`` marker is never counted).
        total: total number of counted tokens.
        V: number of distinct counted tokens (vocabulary size).
    """

    def __init__(self, corpus):
        """Count every token of ``corpus`` (a list of token lists) except ``start``."""
        self.counts = defaultdict(float)
        self.total = 0.0
        for sen in corpus:
            for word in sen:
                if word == start:
                    continue
                self.counts[word] += 1
                self.total += 1
        self.V = len(self.counts)

    def prob(self, word):
        """Return the add-one smoothed probability (count + 1) / (total + V)."""
        return (self.counts.get(word, 0) + 1) / (self.total + self.V)

    def generateSentence(self):
        """Sample tokens independently until ``end`` is drawn.

        Returns:
            A token list beginning with ``start`` and finishing with ``end``.
        """
        # Candidate list and weights never change after training, so they
        # are built once. ``end`` is listed exactly once: it is only appended
        # when it was not already observed in the training data.
        candidates = list(self.counts)
        if end not in self.counts:
            candidates.append(end)
        weights = [self.prob(w) for w in candidates]
        # Scaling the draw by the total mass keeps every candidate reachable
        # even when the weights do not sum to exactly 1.
        total_mass = sum(weights)

        sentence = [start]
        while True:
            rand = random.random() * total_mass
            for word, p in zip(candidates, weights):
                rand -= p
                if rand <= 0:
                    sentence.append(word)
                    if word == end:
                        return sentence
                    break

    def getSentenceProbability(self, sen):
        """Return the product of smoothed probabilities of ``sen[1:]``.

        The first token (expected to be ``start``) is skipped.
        """
        prob = 1.0
        for word in sen[1:]:
            prob *= self.prob(word)
        return prob

class BigramModel(LanguageModel):
    """Unsmoothed (maximum-likelihood) bigram language model.

    Attributes:
        bigram_counts: (prev, curr) -> number of times ``curr`` followed ``prev``.
        context_counts: prev -> number of bigrams whose first token is ``prev``.
    """

    def __init__(self, corpus):
        """Count all adjacent token pairs in ``corpus`` (a list of token lists)."""
        self.bigram_counts = defaultdict(float)
        self.context_counts = defaultdict(float)
        # prev -> distinct successors in first-seen order, so generation does
        # not have to scan every bigram for each generated token.
        self._successors = {}
        for sen in corpus:
            for prev, curr in zip(sen, sen[1:]):
                if (prev, curr) not in self.bigram_counts:
                    self._successors.setdefault(prev, []).append(curr)
                self.bigram_counts[(prev, curr)] += 1
                self.context_counts[prev] += 1

    def prob(self, prev, curr):
        """Return P(curr | prev) = count(prev, curr) / count(prev, *).

        Returns 0.0 when ``prev`` was never seen as a context. Lookups use
        ``.get`` so querying unseen tokens does not grow the count tables.
        """
        context_total = self.context_counts.get(prev, 0.0)
        if context_total == 0:
            return 0.0
        return self.bigram_counts.get((prev, curr), 0.0) / context_total

    def generateSentence(self):
        """Sample a sentence token by token, conditioning on the previous token.

        Returns:
            A token list beginning with ``start`` and finishing with ``end``.
            If the current token has no observed successor, ``end`` is
            appended and generation stops.
        """
        sentence = [start]
        current = start
        while current != end:
            next_words = self._successors.get(current)
            if not next_words:
                # Dead end: nothing ever followed this token in training.
                sentence.append(end)
                break
            probs = [self.prob(current, w) for w in next_words]
            # Scale the uniform draw by the total mass of the continuations,
            # then walk the list until the draw is used up.
            rand = random.random() * sum(probs)
            for w, p in zip(next_words, probs):
                rand -= p
                if rand <= 0:
                    current = w
                    sentence.append(w)
                    break
        return sentence

    def getSentenceProbability(self, sen):
        """Return the product of P(sen[i] | sen[i-1]) for i >= 1.

        Returns 0.0 as soon as a bigram with zero probability is encountered.
        """
        prob = 1.0
        for prev, curr in zip(sen, sen[1:]):
            p = self.prob(prev, curr)
            if p == 0:
                return 0.0
            prob *= p
        return prob


class SmoothedBigramModelLI(LanguageModel):
    """Bigram language model smoothed by linear interpolation with unigrams.

    P(curr | prev) = lam1 * P_bigram(curr | prev) + lam2 * P_unigram(curr)

    Attributes:
        bigram_counts: (prev, curr) -> number of times ``curr`` followed ``prev``.
        context_counts: prev -> number of bigrams whose first token is ``prev``.
        unigram_counts: token -> number of occurrences (``start`` excluded).
        total_uni: total number of counted unigram tokens.
        lam1, lam2: interpolation weights of the bigram and unigram terms.
    """

    def __init__(self, corpus):
        """Collect unigram and bigram counts from ``corpus`` (a list of token lists)."""
        self.bigram_counts = defaultdict(float)
        self.context_counts = defaultdict(float)
        self.unigram_counts = defaultdict(float)
        self.total_uni = 0.0
        for sen in corpus:
            for w in sen:
                # <s> is never predicted, so it is left out of the unigrams.
                if w != start:
                    self.unigram_counts[w] += 1
                    self.total_uni += 1
            for prev, curr in zip(sen, sen[1:]):
                self.bigram_counts[(prev, curr)] += 1
                self.context_counts[prev] += 1
        self.lam1 = 0.5  # weight of the bigram estimate
        self.lam2 = 0.5  # weight of the unigram estimate

    def prob(self, prev, curr):
        """Return the interpolated probability of ``curr`` following ``prev``.

        Either component is treated as 0.0 when its denominator is zero.
        Lookups use ``.get`` so querying unseen tokens does not insert
        zero-count keys into the count tables.
        """
        context_total = self.context_counts.get(prev, 0.0)
        if context_total > 0:
            p_bi = self.bigram_counts.get((prev, curr), 0.0) / context_total
        else:
            p_bi = 0.0
        if self.total_uni > 0:
            p_uni = self.unigram_counts.get(curr, 0.0) / self.total_uni
        else:
            p_uni = 0.0
        return self.lam1 * p_bi + self.lam2 * p_uni

    def generateSentence(self):
        """Sample a sentence token by token from the interpolated distribution.

        Returns:
            A token list beginning with ``start`` and finishing with ``end``.
            If every candidate has probability 0, ``end`` is appended and
            generation stops.
        """
        # ``end`` must appear exactly once among the candidates; listing it
        # twice would double its sampling weight relative to prob().
        possible_words = list(self.unigram_counts)
        if end not in self.unigram_counts:
            possible_words.append(end)

        sen = [start]
        curr = start
        while curr != end:
            probs = [self.prob(curr, w) for w in possible_words]
            total = sum(probs)
            if total == 0:
                sen.append(end)
                break
            # Scaling the draw by ``total`` is equivalent to normalising the
            # probabilities before sampling.
            rand = random.random() * total
            for w, p in zip(possible_words, probs):
                rand -= p
                if rand <= 0:
                    curr = w
                    sen.append(w)
                    break
        return sen

    def getSentenceProbability(self, sen):
        """Return the product of interpolated P(sen[i] | sen[i-1]) for i >= 1."""
        prob = 1.0
        for prev, curr in zip(sen, sen[1:]):
            prob *= self.prob(prev, curr)
        return prob

#-------------------------------------------
# Main
#-------------------------------------------
if __name__ == "__main__":
    # Load the training data, replace rare words with UNK and add <s> / </s>.
    trainCorpus = preprocess(readFileToCorpus('train.txt'))

    # The vocabulary is every token left in the preprocessed training corpus.
    vocab = {word for sen in trainCorpus for word in sen}

    # Load both test sets first, then map out-of-vocabulary words to UNK.
    posTestCorpus = readFileToCorpus('pos_test.txt')
    negTestCorpus = readFileToCorpus('neg_test.txt')
    posTestCorpus = preprocessTest(vocab, posTestCorpus)
    negTestCorpus = preprocessTest(vocab, negTestCorpus)

    # Train the four models, reporting progress after each one.
    model_names = ['Unigram', 'SmoothedUnigram', 'Bigram', 'SmoothedBigramLI']
    models = []
    for model_class in (UnigramModel, SmoothedUnigramModel,
                        BigramModel, SmoothedBigramModelLI):
        models.append(model_class(trainCorpus))
        print("done")
    uni_model, smooth_uni_model, bi_model, smooth_bi_model = models

    # Write 20 generated sentences per model to its own output file.
    output_files = [
        'unigram_output.txt',
        'smooth_unigram_output.txt',
        'bigram_output.txt',
        'smooth_bi_li_output.txt',
    ]
    for model, filename in zip(models, output_files):
        model.generateSentencesToFile(20, filename)
        print("done")

    # Report perplexity on the positive and negative test corpora.
    for model, name in zip(models, model_names):
        pos_ppl = model.getCorpusPerplexity(posTestCorpus)
        neg_ppl = model.getCorpusPerplexity(negTestCorpus)
        print(f"{name} pos perplexity: {pos_ppl}")
        print(f"{name} neg perplexity: {neg_ppl}")
'''
1-

The unigram model’s sentence length is controlled solely by the chance of drawing </s>, while bigram models
condition on the previous word, leading to more context-driven and realistic sentence boundaries
2-
Unsmoothed models assign zero probability to unseen n-grams, leading to drastic differences.
Smoothed models spread probability mass to unseen events, leading to more reasonable sentence probabilities
and thus less drastic differences.

3-
The smoothed bigram model produces more realistic sentences by handling unseen bigrams through
interpolation, avoiding abrupt endings.
4-
Perplexities:

Unigram: pos=628.67, neg=612.26
SmoothedUnigram: pos=631.56, neg=615.36
Bigram: pos=inf, neg=inf (due to OOV or zero prob bigrams)
SmoothedBigramLI: pos=243.50, neg=251.22

'''


done
done
done
done
done
done
done
done
Unigram pos perplexity: 628.6696007319457
Unigram neg perplexity: 612.2628559648674
SmoothedUnigram pos perplexity: 631.5562305194113
SmoothedUnigram neg perplexity: 615.3626467739131
Bigram pos perplexity: inf
Bigram neg perplexity: inf
SmoothedBigramLI pos perplexity: 243.5032538798665
SmoothedBigramLI neg perplexity: 251.21906135625858


'\n1-\n \nThe unigram model’s sentence length is controlled solely by the chance of drawing </s>, while bigram models \ncondition on the previous word, leading to more context-driven and realistic sentence boundaries\n2-\nUnsmoothed models assign zero probability to unseen n-grams, leading to drastic differences.\nSmoothed models spread probability mass to unseen events,leading to more reasonable sentence probabilities.thus\nresulting in less drastic differences,  \n\n3-\nThe smoothed bigram model produces more realistic sentences by handling unseen bigrams through \ninterpolation, avoiding abrupt endings.\n4-\nPerplexities:\n\nUnigram: pos= 628.67,, neg=612.26\nSmoothedUnigram: pos=631.56, neg=615.36\nBigram: pos=inf, neg=inf (due to OOV or zero prob bigrams)\nSmoothedBigramLI: pos=243.50, neg=251.22\n\n'