## Question 1

In [267]:
from collections import Counter, defaultdict
class BPE:
    def __init__(self, corpus, merge_size):
        self.corpus = corpus
        self.merge_size = merge_size
        self.word_freqs = defaultdict(int)
        self.splits = {}
        self.merges = {}


    def learn_vocabulary(self):
        # Compute the frequencies of each word in the corpus
        for text in self.corpus:
            words = text.split(" ")
            for word in words:
                self.word_freqs[word] += 1

        # Compute the base vocabulary of all characters in the corpus
        alphabet = []
        for word in self.word_freqs.keys():
            for letter in word:
                if letter not in alphabet:
                    alphabet.append(letter)
        alphabet.sort()

        # Add the special token </w> at the beginning of the vocabulary
        vocab = ["</w>"] + alphabet.copy()

        # Split each word into individual characters before training
        self.splits = {word: [c for c in word] for word in self.word_freqs.keys()}

        # Merge the most frequent pair iteratively until the vocabulary size is reached
        while len(self.merges) < self.merge_size:
            print(len(self.merges),self.merge_size)
            # Compute the frequency of each pair
            pair_freqs = self.compute_pair_freqs()

            # Find the most frequent pair
            best_pair = max(pair_freqs, key=pair_freqs.get)

            # Merge the most frequent pair
            self.splits = self.merge_pair(*best_pair)
            self.merges[best_pair] = best_pair[0] + best_pair[1]
            vocab.append(best_pair[0] + best_pair[1])

        with open('token.txt', 'w') as f:
            for item in vocab:
                f.write(item + '\n')
        with open("merge_rules.txt", 'w') as f:
            print(len(self.merges))
            for item in self.merges:
                f.write(item[0] + ',' + item[1] + ' = ' + self.merges[item] + '\n')
        return self.merges

    def compute_pair_freqs(self):
        pair_freqs = defaultdict(int)
        for word, freq in self.word_freqs.items():
            split = self.splits[word]
            if len(split) == 1:
                continue
            for i in range(len(split) - 1):
                pair = (split[i], split[i + 1])
                pair_freqs[pair] += freq
        # print(pair_freqs)
        return pair_freqs


    def merge_pair(self, a, b):
        for word in self.word_freqs:
            split = self.splits[word]
            if len(split) == 1:
                continue
            i = 0
            while i < len(split) - 1:
                if split[i] == a and split[i + 1] == b:
                    split = split[:i] + [a + b] + split[i + 2 :]
                else:
                    i += 1
            self.splits[word] = split
        return self.splits


    def tokenize(self, text):
        pre_tokenized_text = text.split(" ")
        splits_text = [[l for l in word] for word in pre_tokenized_text]

        for pair, merge in self.merges.items():
            for idx, split in enumerate(splits_text):
                i = 0
                while i < len(split) - 1:
                    if split[i] == pair[0] and split[i + 1] == pair[1]:
                        split = split[:i] + [merge] + split[i + 2 :]
                    else:
                        i += 1
                splits_text[idx] = split
        result = sum(splits_text, [])
        with open('tokenized.txt', 'w') as f:
            for item in result:
                f.write(item + '\n')
        return result

In [269]:

with open('trail.txt', encoding="utf8") as f:
    corpus = f.readlines()

# set the hyperparameter of vocabulary size
merge_size = 80

# create a BPE tokenizer object
bpe = BPE(corpus=corpus, merge_size=merge_size)

bpe.learn_vocabulary()

text_to_Test = '''i stand here i feel empty a class post count link href http mooshilu
i literally just text tychelle to see if she wants to hang out because reading what i just wrote about my nonexistent social life made me feel so pathetic
i really feel regretful when hearing that shinae got married to another man oh it s really sad i really hope that alex and shinae can be a couple in real life they re perfect for each other
i believed it was true love and feel devastated i wanted to settle down and have the whole marriage and kids thing with him
i feel unimportant so inadequate
i feel very low already'''
bpe.tokenize(text_to_Test)


0 80
defaultdict(<class 'int'>, {('T', 'e'): 2, ('e', 's'): 5, ('s', 't'): 2, ('s', 'a'): 2, ('a', 'm'): 2, ('m', 'p'): 2, ('p', 'l'): 4, ('l', 'e'): 3, ('N', 'a'): 1, ('a', 't'): 3, ('t', 'u'): 1, ('u', 'r'): 1, ('r', 'a'): 2, ('a', 'l'): 2, ('l', 'a'): 3, ('a', 'n'): 5, ('n', 'g'): 6, ('g', 'u'): 2, ('u', 'a'): 2, ('a', 'g'): 2, ('g', 'e'): 3, ('p', 'r'): 1, ('r', 'o'): 2, ('o', 'c'): 1, ('c', 'e'): 1, ('s', 's'): 1, ('s', 'i'): 1, ('i', 'n'): 5, ('t', 'e'): 3, ('e', 'c'): 2, ('c', 'h'): 1, ('h', 'n'): 1, ('n', 'i'): 1, ('i', 'q'): 1, ('q', 'u'): 1, ('u', 'e'): 1, ('a', 'y'): 1, ('c', 'r'): 1, ('r', 'u'): 1, ('u', 'c'): 1, ('c', 'i'): 1, ('i', 'a'): 1, ('o', 'l'): 1, ('m', 'o'): 1, ('o', 'd'): 2, ('d', 'e'): 2, ('e', 'r'): 1, ('r', 'n'): 1, ('A', 'I'): 1, ('a', 'p'): 1, ('p', 'p'): 1, ('l', 'i'): 2, ('i', 'c'): 1, ('c', 'a'): 1, ('t', 'i'): 2, ('i', 'o'): 1, ('o', 'n'): 1, ('n', 's'): 1, ('s', '\n'): 1, ('B', 'y'): 1, ('y', 't'): 1, ('P', 'a'): 1, ('a', 'i'): 1, ('i', 'r'): 1, ('E', 

['i',
 's',
 't',
 'and',
 'h',
 'e',
 'r',
 'e',
 'i',
 'f',
 'e',
 'e',
 'l',
 'e',
 'm',
 'p',
 't',
 'y',
 'a',
 'c',
 'l',
 'a',
 's',
 's',
 'p',
 'o',
 's',
 't',
 'c',
 'o',
 'u',
 'n',
 't',
 'l',
 'in',
 'k',
 'h',
 'r',
 'ef',
 'h',
 't',
 't',
 'p',
 'm',
 'o',
 'o',
 's',
 'h',
 'i',
 'l',
 'u',
 '\n',
 'i',
 'l',
 'i',
 'te',
 'r',
 'al',
 'l',
 'y',
 'j',
 'u',
 's',
 't',
 'te',
 'x',
 't',
 't',
 'y',
 'c',
 'h',
 'e',
 'l',
 'l',
 'e',
 't',
 'o',
 's',
 'e',
 'e',
 'i',
 'f',
 's',
 'h',
 'e',
 'w',
 'an',
 't',
 's',
 't',
 'o',
 'h',
 'ang',
 'o',
 'u',
 't',
 'b',
 'e',
 'c',
 'a',
 'u',
 's',
 'e',
 'r',
 'e',
 'a',
 'd',
 'ing',
 'w',
 'h',
 'at',
 'i',
 'j',
 'u',
 's',
 't',
 'w',
 'ro',
 'te',
 'a',
 'b',
 'o',
 'u',
 't',
 'm',
 'y',
 'n',
 'o',
 'n',
 'e',
 'x',
 'is',
 'te',
 'n',
 't',
 's',
 'o',
 'c',
 'i',
 'al',
 'l',
 'i',
 'f',
 'e',
 'm',
 'a',
 'd',
 'e',
 'm',
 'e',
 'f',
 'e',
 'e',
 'l',
 's',
 'o',
 'p',
 'at',
 'h',
 'e',
 't',
 'i',
 'c',
 '

## Question 2

In [244]:
import numpy as np
from collections import defaultdict

class BigramLM:
    def __init__(self, vocabulary):
        self.vocabulary = vocabulary
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)
        self.total_bigrams = 0

    def learn_model(self, dataset):
        for sentence in dataset:
            sentence = ["<s>"] + sentence + ["</s>"]
            for i in range(len(sentence) - 1):
                self.bigram_counts[sentence[i]][sentence[i+1]] += 1
                self.unigram_counts[sentence[i]] += 1
                self.total_bigrams += 1

    def laplace_smoothing(self, bigram, alpha=1):
        numerator = self.bigram_counts[bigram[0]][bigram[1]] + alpha
        denominator = self.unigram_counts[bigram[0]] + (len(self.vocabulary) * alpha)
        return numerator / denominator

    def kneser_ney_smoothing(self, bigram, discount=0.75):
        prefix_count = self.unigram_counts[bigram[0]]
        continuation_count = len(self.bigram_counts[bigram[0]])

        # Kneser-Ney smoothing formula
        prob = max(self.bigram_counts[bigram[0]][bigram[1]] - discount, 0) / prefix_count
        prob += (discount / prefix_count) * continuation_count * (1 / self.total_bigrams)
        return prob

if __name__ == "__main__":
    # Example usage
    training_data = [
        ["I", "love", "programming"],
        ["Programming", "is", "fun"],
        ["Machine", "learning", "is", "exciting"]
    ]

    vocabulary = set(word for sentence in training_data for word in sentence)

    bigram_lm = BigramLM(vocabulary)
    bigram_lm.learn_model(training_data)

    test_bigram = ("Programming", "is")

    laplace_prob = bigram_lm.laplace_smoothing(test_bigram)
    kneser_ney_prob = bigram_lm.kneser_ney_smoothing(test_bigram)

    print(f"Laplace Smoothing Probability: {laplace_prob}")
    print(f"Kneser-Ney Smoothing Probability: {kneser_ney_prob}")


Laplace Smoothing Probability: 0.2
Kneser-Ney Smoothing Probability: 0.3076923076923077


## Why kneser ney is better than laplace smoothing
Kneser ney is uses discounting method to reduce the probability mass assigned to frequent bigrams and redistribute it to unseen bigrams. This is better than laplace smoothing because laplace smoothing assigns equal probability to all unseen bigrams which is not a good way to estimate the probability of unseen bigrams.

In [245]:
from utils import emotion_scores

print(emotion_scores("I am happy"))

[{'label': 'sadness', 'score': 0.0005438437219709158}, {'label': 'joy', 'score': 0.998437225818634}, {'label': 'love', 'score': 0.0004561925888992846}, {'label': 'anger', 'score': 0.00026224859175272286}, {'label': 'fear', 'score': 0.0001431150158168748}, {'label': 'surprise', 'score': 0.00015750851889606565}]


In [2]:
import pandas as pd

In [33]:
sentences = []
labels = []
with open('corpus.txt', encoding="utf8") as f:
    for line in f:
        sentences.append(line.strip())

with open('labels.txt', encoding="utf8") as f:
    for line in f:
        labels.append(line.strip())

df = pd.DataFrame({'sentences': sentences, 'labels': labels})
# print(df.head())
# sample 20 for each label
# print(df['labels'].value_counts())
df_sampled = df.groupby('labels').apply(lambda x: x.sample(40)).reset_index(drop=True)
# choose only anger joy and sadness
# df_sampled = df[df['labels'].isin(['anger'])]
# print(df_sampled.head())
print(df_sampled.head(1200))
# print number of samples for each label
# print(df_sampled['labels'].value_counts())

# save the sampled data in a trail.txt
with open('trail.txt', 'w', encoding="utf8") as f:
    for index, row in df_sampled.iterrows():
        f.write(row['sentences'] + '\n')

                                             sentences    labels
0                      i have a feeling i shall go mad     anger
1    i have a feeling this is going to be really lo...     anger
2    i feel that we are heading for an abyss that h...     anger
3    im gradually feeling a little irritated with h...     anger
4             i still feel incredibly frustrated by it     anger
..                                                 ...       ...
235  i always feel like i need drugs after which is...  surprise
236        i was cut into feeling pain that shocked me  surprise
237  i still feel so amazed knowing i stood right i...  surprise
238  i hardly feel they have any wow factor at all ...  surprise
239  im not quite sure why and she treated me well ...  surprise

[240 rows x 2 columns]


In [39]:
import numpy as np
import pickle
from utils import emotion_scores


import warnings
warnings.filterwarnings("ignore")

class BigramLM:
    def __init__(self, file, smoothing = 2):

        self.tokens = set()
        self.data = ""

        with open(file, 'r') as fil:
            data = fil.read()
        lin_data = data.split("\n")
        for sntc in lin_data:
            fsntc = f'<s> {sntc} <e> '
            self.data += fsntc
            self.tokens.update(fsntc.split())

        self.tok_id = {token: index for index, token in enumerate(self.tokens)}

        self.id_tok = {index: token for token, index in self.tok_id.items()}
        self.tokenized = self.get_tokens(self.data)

        self.coeff_mat, self.sad_mat, self.joy_mat, self.lov_mat, self.ang_mat, self.fea_mat, self.sup_mat = self.gen_matrix(self.tokenized)

        if(smoothing == 1):
            self.prob_mat = self.laplace_mat()
        elif(smoothing == 2):
            self.prob_mat = self.kneser_ney_mat()
        else:
            self.prob_mat = self.create_mat()


    def get_tokens(self, seq):
        toks = seq.split()
        return [self.tok_id[tok] for tok in toks]

    def get_seq(self, ids):
        tokid = [self.id_tok[id] for id in ids]
        sent = " ".join(tokid)
        return sent

    def create_mat(self):
        return self.coeff_mat/self.coeff_mat.sum(1, keepdims=True)

    def laplace_mat(self):
        prob_mt = self.coeff_mat + 1
        return prob_mt/prob_mt.sum(1, keepdims=True)

    def kneser_ney_mat(self, d = 0.75):
        prob_mt = self.coeff_mat
        frst_trm = np.maximum(prob_mt - d, 0)/prob_mt.sum(1, keepdims=True)
        lmbd_trm = (d/prob_mt.sum(axis = 1,keepdims=True)) * np.count_nonzero(prob_mt, axis=1, keepdims=True)
        cont = np.count_nonzero(prob_mt, axis=0, keepdims=True)
        pcnt_trm = cont/ cont.sum()
        return frst_trm + lmbd_trm * pcnt_trm

    def genrate(self, emotion, size = 10):
        idx = self.get_tokens("<s>")[0]
        generated_sequence = ""
        i = 0
        if emotion == 'sad':
            mat = self.sad_mat
        elif emotion == 'joy':
            mat = self.joy_mat
        elif emotion == 'love':
            mat = self.lov_mat
        elif emotion == 'angry':
            mat = self.ang_mat
        elif emotion == 'fear':
            mat = self.fea_mat
        elif emotion == 'surprise':
            mat = self.sup_mat
        else:
            mat = np.zeros_like(self.coeff_mat)

        curr_mat = self. prob_mat + mat
        curr_mat /= curr_mat.sum(1, keepdims=True)

        while self.get_seq([idx]) != "<e>":
            if i == size:
                generated_sequence += "."
                break
            elif(i != 0):
                generated_sequence += self.get_seq([idx]) + " "
            p = curr_mat[idx]
            idx = int(np.random.choice(len(p), p=p))
            i += 1
        return generated_sequence

    def gen_matrix(self, tokens):
        max_tkn = max(tokens) + 1
        coeff_mat = np.zeros((max_tkn, max_tkn), dtype=np.float64)
        sad_mat = np.zeros((max_tkn, max_tkn), dtype=np.float64)
        joy_mat = np.zeros((max_tkn, max_tkn), dtype=np.float64)
        lov_mat = np.zeros((max_tkn, max_tkn), dtype=np.float64)
        ang_mat = np.zeros((max_tkn, max_tkn), dtype=np.float64)
        fea_mat = np.zeros((max_tkn, max_tkn), dtype=np.float64)
        sup_mat = np.zeros((max_tkn, max_tkn), dtype=np.float64)
        for t1, t2 in zip(tokens, tokens[1:]):
            coeff_mat[t1, t2] += 1
            if(coeff_mat[t1, t2] == 1):
                emote = emotion_scores(self.get_seq([t1,t2]))
                # check index for which score is max
                scores = [emote[i]['score'] for i in range(6)]
                max_idx = scores.index(max(scores))
                # if max_idx ==0:
                    # sad_mat[t1, t2] = 3
                # if max_idx ==1:
                    # joy_mat[t1, t2] = 2
                if max_idx ==3:
                    ang_mat[t1, t2] = 3
                sad_mat[t1, t2] += emote[0]['score']
                joy_mat[t1, t2] += emote[1]['score']
                lov_mat[t1, t2] = emote[2]['score']
                ang_mat[t1, t2] += emote[3]['score']
                fea_mat[t1, t2] = emote[4]['score']
                sup_mat[t1, t2] = emote[5]['score']
        return coeff_mat, sad_mat, joy_mat, lov_mat, ang_mat, fea_mat, sup_mat



    def save(self, filename = "bigram_mdl.pkl"):
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    def get_emotion_score(self, size = -1, emotion = None):
        sntnc = self.genrate(emotion, size = size)
        emt_scr = emotion_scores(sntnc)
        emote = max(emt_scr, key=lambda x: x['score'])
        return sntnc, emote["label"], emote["score"]





def load(filename = "bigram_mdl.pkl"):
        with open(filename, 'rb') as f:
            obj = pickle.load(f)
        return obj



# bgram = load()
#

# bgram = BigramLM("corpus.txt")
bgram = BigramLM("trail.txt")
bgram.save()
# emotions = ["sad", "joy", "love", "anger", "fear", "surprise"]
# test_labels = []
# for i in emotions:
#     for j in range(50):
#         test_labels.append(i)
# test_corpus = []
# for i in test_labels:
#     sent, label, score = bgram.get_emotion_score(size = 50, emotion=i)
#     # print(score)
#     test_corpus.append(sent[14:])


In [35]:
bgram = load()

emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]
# emotions = ['anger']
test_labels = []
for i in emotions:
    for j in range(50):
        test_labels.append(i)
test_corpus = []
count_low_conf = [0 for i in range(6)]
for i in test_labels:
    sent, label, score = bgram.get_emotion_score(size = 10, emotion=i)
    if score<0.99:
        count_low_conf[emotions.index(label)] += 1
    test_corpus.append(sent[14:])
a = len(set(test_corpus))
print(len(test_corpus))
print(a)
for i in range(6):
    print(emotions[i],count_low_conf[i])





300
267
sadness 7
joy 68
love 21
anger 48
fear 20
surprise 23


In [36]:
test_corp = []
test_label = []
count = [0 for i in range(6)]
for i in range(len(test_corpus)):
    if test_corpus[i] not in test_corp:
        test_corp.append(test_corpus[i])
        test_label.append(test_labels[i])
    else:
        count[emotions.index(test_labels[i])] += 1
test_corpus = test_corp
test_labels = test_label
a = len(set(test_corpus))
print(len(test_corpus))
print(len(test_labels))
print(a)
for i in range(6):
    print(emotions[i],count[i])

267
267
267
sadness 1
joy 15
love 5
anger 4
fear 2
surprise 6


In [43]:
test_corpus = []
with open('Generated_anger_KneserNey.txt') as f:
    for line in f:
        test_corpus.append(line)
with open('Generated_fear_KneserNey.txt') as f:
    for line in f:
        test_corpus.append(line)
with open('Generated_joy_KneserNey.txt') as f:
    for line in f:
        test_corpus.append(line)
with open('Generated_love_KneserNey.txt') as f:
    for line in f:
        test_corpus.append(line)
with open('Generated_sadness_KneserNey.txt') as f:
    for line in f:
        test_corpus.append(line)
with open('Generated_surprise_KneserNey.txt') as f:
    for line in f:
        test_corpus.append(line)
test_labels = ['anger']*50 + ['fear']*50 + ['joy']*50 + ['love']*50 + ['sadness']*50 + ['surprise']*50

print(test_labels)



['anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'anger', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'fear', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'j

In [44]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

X_train = []
with open('corpus.txt', encoding="utf8") as f:
    for line in f:
        X_train.append(line)
y_train = []
with open('labels.txt', encoding="utf8") as f:
    for line in f:
        y_train.append(line)
x_test = test_corpus
y_test = test_labels
# x_test = []
# with open('generated_sentences.txt', encoding="utf8") as f:
#     for line in f:
#         x_test.append(line)
# y_test = []
# with open('target_emotions.txt', encoding="utf8") as f:
#     for line in f:
#         y_test.append(line)
# Initialize TF-IDF vectorizer
# X_train, x_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer()

# Fit and transform on the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(x_test)
# Define parameter grid for Grid Search
# param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}
#
# # Initialize SVC
svc = SVC()
#
# # Perform Grid Search with 5-fold cross-validation
# grid_search = GridSearchCV(svc, param_grid, cv=5, n_jobs=-1)
# grid_search.fit(X_train_tfidf, y_train)
#
# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best Parameters:", best_params)
# Train SVC with the best parameters obtained from Grid Search
best_svc = SVC(C=10, gamma=0.1,kernel='rbf')
best_svc.fit(X_train_tfidf, y_train)
# Predict using the trained model
y_pred = best_svc.predict(X_test_tfidf)
y_pred = [i.strip() for i in y_pred]

# print(y_test)
# Print classification report
print(classification_report(y_test, y_pred))

print(accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

       anger       0.67      0.32      0.43        50
        fear       0.61      0.46      0.52        50
         joy       0.47      0.80      0.59        50
        love       0.90      0.90      0.90        50
     sadness       0.68      0.86      0.76        50
    surprise       0.90      0.72      0.80        50

    accuracy                           0.68       300
   macro avg       0.70      0.68      0.67       300
weighted avg       0.70      0.68      0.67       300

0.6766666666666666
