In [1]:
import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from sklearn.model_selection import train_test_split
from IPython import display

In [157]:
class Language_Model:
    def __init__(self, corpus, V):
        self.corpus = corpus
        self.V = V
        self.max_ngram = 3
        self.Count_unigram = {}
        self.Count_bigram = {}
        self.Count_trigram = {}
        self.n_grams_counts()

    def n_grams_counts(self):
        start = []
        end = ["</s>"]
        for i in range(max(1,self.max_ngram-1)):
            start.append("<s>")

        for tweet in self.corpus:
            words = start+tweet+end 
            for index, word in enumerate(words[self.max_ngram-1:]):
                context = [word]
                if word not in self.Count_unigram: 
                    self.Count_unigram[word] = 0
                self.Count_unigram[word]+=1

                for j in range(1,self.max_ngram): #Create n_gram
                    context.append(words[(index + self.max_ngram -1) - j])
                    if len(context) == 2:
                        if self.Count_bigram.get(str(context)) is not None: self.Count_bigram[str(context)] += 1
                        else: self.Count_bigram[str(context)] = 1
                    elif len(context) == 3:
                        if self.Count_trigram.get(str(context)) is not None: self.Count_trigram[str(context)] += 1
                        else: self.Count_trigram[str(context)] = 1
    
    def Laplace(self,cnt1, cnt2):
        a = cnt1 + 1
        b = cnt2 + len(self.V)
        return a/b

    def get_probability(self,words):
        context = str(words[:-1])
        size = len(words)
        words = str(words)
        cnt1 = self.Count_unigram[words] if size == 1 and words in self.Count_unigram else 0
        cnt1 = self.Count_bigram[words] if size == 2 and words in self.Count_bigram else 0
        cnt1 = self.Count_trigram[words] if size == 3 and words in self.Count_trigram else 0
        
        cnt2 = self.Count_unigram[context] if size == 1 and context in self.Count_unigram else 0
        cnt2 = self.Count_bigram[context] if size == 2 and context in self.Count_bigram else 0
        
        return self.Laplace(cnt1, cnt2)

    def P_n_gram_sequence(self, sequence, n): #bigram,unigram,trigram
        if n < 1: return
        start = []
        end = ["</s>"]
        for i in range(max(1,n-1)):
            start.append("<s>")
        words = start+sequence+end
        print(words)
        final_p = 1
        for i in range(len(words[n-1:])):
            context = words[i:i+n]
            print(context)
            p = self.get_probability(context)
            final_p *= p

        return final_p
    
    def perplexity(self,probabilities, N):
        p = np.array(probabilities)
        return 2**(-1/N * np.sum(np.log2(p) ))

    def interpolation(self,validation, lambdas):
        start = []
        end = ["</s>"]
        for i in range(max(1,self.max_ngram-1)):
            start.append("<s>")
        validation = start+validation+end
        N = len(validation)
        size = len(validation)
        probabilities = []
        for idx in range(size):
            context = []
            p = np.zeros(3,dtype="float")    
            for j, next_word in enumerate(validation[idx:idx+3]):
                context.append(next_word)
                p[j] = self.get_probability(context)
                print(context)
            final_p = p[0]*lambdas[0] + p[1]*lambdas[1] + p[2]*lambdas[2]
            probabilities.append(final_p)
            
        return self.perplexity(probabilities, N)

In [41]:
def get_tweets_from_file(path_corpus):
    tr_tweet = []    
    with open(path_corpus, "r") as f_corpus:
        for tweet in f_corpus:
            tr_tweet += [tweet]
    return tr_tweet

In [4]:
def define_vocabulary(corpus, size):
    corpus_words = []
    for doc in corpus:
        corpus_words += tokenizer.tokenize(doc)
    fdist = nltk.FreqDist(corpus_words)
    V = [(fdist[key], key) for key in fdist]
    V.sort() 
    V.reverse()
    V = V[:size]
    V = [word for count, word in V]
    return V

In [5]:
def preprocess_corpus(corpus, volcabulary_size):
    # Make sure all tweets are in lowercase.
    for i in range(len(corpus)):
        corpus[i]= corpus[i].lower()
    # Define vocabulary with the 500 most frequent words.
    V = define_vocabulary(corpus, volcabulary_size)
    # Rebuild corpus
    new_corpus = []
    for tweet in corpus:
        words = tokenizer.tokenize(tweet)    
        new_words = ["<unk>" if word not in V else word for word in words] # New lexicon with unknown words
        new_corpus.append(new_words)
    return new_corpus, V

In [6]:
tr_tweet = get_tweets_from_file("./mex_train.txt")
tr_tweet, V = preprocess_corpus(tr_tweet, 500)

In [74]:
model = Language_Model(tr_tweet,V)

In [81]:
print(tr_tweet[0])
model.P_n_gram_sequence(tr_tweet[0],3)

['lo', 'peor', 'de', 'todo', 'es', 'que', 'no', 'me', 'dan', 'por', 'un', 'tiempo', 'y', 'luego', '<unk>', 'estoy', 'hasta', 'la', 'verga', 'de', '<unk>']
['<s>', '<s>', 'lo', 'peor', 'de', 'todo', 'es', 'que', 'no', 'me', 'dan', 'por', 'un', 'tiempo', 'y', 'luego', '<unk>', 'estoy', 'hasta', 'la', 'verga', 'de', '<unk>', '</s>']
['<s>', '<s>', 'lo']
['<s>', 'lo', 'peor']
['lo', 'peor', 'de']
['peor', 'de', 'todo']
['de', 'todo', 'es']
['todo', 'es', 'que']
['es', 'que', 'no']
['que', 'no', 'me']
['no', 'me', 'dan']
['me', 'dan', 'por']
['dan', 'por', 'un']
['por', 'un', 'tiempo']
['un', 'tiempo', 'y']
['tiempo', 'y', 'luego']
['y', 'luego', '<unk>']
['luego', '<unk>', 'estoy']
['<unk>', 'estoy', 'hasta']
['estoy', 'hasta', 'la']
['hasta', 'la', 'verga']
['la', 'verga', 'de']
['verga', 'de', '<unk>']
['de', '<unk>', '</s>']


4.529848320000003e-58

In [82]:
model.Count_bigram

{"['lo', '<s>']": 48,
 "['peor', 'lo']": 12,
 "['de', 'peor']": 3,
 "['todo', 'de']": 11,
 "['es', 'todo']": 3,
 "['que', 'es']": 80,
 "['no', 'que']": 221,
 "['me', 'no']": 118,
 "['dan', 'me']": 13,
 "['por', 'dan']": 1,
 "['un', 'por']": 23,
 "['tiempo', 'un']": 3,
 "['y', 'tiempo']": 4,
 "['luego', 'y']": 20,
 "['<unk>', 'luego']": 9,
 "['estoy', '<unk>']": 31,
 "['hasta', 'estoy']": 32,
 "['la', 'hasta']": 83,
 "['verga', 'la']": 419,
 "['de', 'verga']": 28,
 "['<unk>', 'de']": 1266,
 "['</s>', '<unk>']": 1658,
 "['a', '<s>']": 118,
 "['la', 'a']": 391,
 "['<unk>', 'la']": 1044,
 "['no', '<unk>']": 339,
 "['seas', 'no']": 18,
 "['<unk>', 'seas']": 9,
 "['<unk>', '<unk>']": 5977,
 "['putos', '<unk>']": 168,
 "['minutos', 'putos']": 4,
 "['después', 'minutos']": 1,
 "['me', 'después']": 5,
 "['<unk>', 'me']": 878,
 "['que', '<unk>']": 970,
 "['<unk>', 'que']": 814,
 "['en', '<unk>']": 628,
 "['3', 'en']": 1,
 "['horas', '3']": 3,
 "['?', 'horas']": 1,
 "['<unk>', '?']": 97,
 "['<unk

In [83]:
tr_tweet = get_tweets_from_file("./mex_train.txt")
tr_tweet, V = preprocess_corpus(tr_tweet, 500)

In [102]:
tr_tweet_train, tr_tweet_test = train_test_split(tr_tweet, test_size = 0.2)
tr_tweet_test, tr_tweet_val = train_test_split(tr_tweet_test, test_size = 0.5)
val = []
for words_tweet in tr_tweet_val: 
    val += words_tweet

In [85]:
lambdas = [
    [1/3, 1/3, 1/3],
    [.4, .4, .2],
    [.2, .4, .4],
    [.5, .4, .1],
    [.1, .4, .5]
]

In [158]:
model = Language_Model(tr_tweet_train,V)

In [164]:
model.interpolation(val, lambdas[4])

['<s>']
['<s>', '<s>']
['<s>', '<s>', 'me']
['<s>']
['<s>', 'me']
['<s>', 'me', '<unk>']
['me']
['me', '<unk>']
['me', '<unk>', 'súper']
['<unk>']
['<unk>', 'súper']
['<unk>', 'súper', 'loca']
['súper']
['súper', 'loca']
['súper', 'loca', 'por']
['loca']
['loca', 'por']
['loca', 'por', '<unk>']
['por']
['por', '<unk>']
['por', '<unk>', 'triste']
['<unk>']
['<unk>', 'triste']
['<unk>', 'triste', 'por']
['triste']
['triste', 'por']
['triste', 'por', '<unk>']
['por']
['por', '<unk>']
['por', '<unk>', '<unk>']
['<unk>']
['<unk>', '<unk>']
['<unk>', '<unk>', 'y']
['<unk>']
['<unk>', 'y']
['<unk>', 'y', '<unk>']
['y']
['y', '<unk>']
['y', '<unk>', 'a']
['<unk>']
['<unk>', 'a']
['<unk>', 'a', 'twitter']
['a']
['a', 'twitter']
['a', 'twitter', 'y']
['twitter']
['twitter', 'y']
['twitter', 'y', 'me']
['y']
['y', 'me']
['y', 'me', 'doy']
['me']
['me', 'doy']
['me', 'doy', 'cuenta']
['doy']
['doy', 'cuenta']
['doy', 'cuenta', 'de']
['cuenta']
['cuenta', 'de']
['cuenta', 'de', 'que']
['de']
['de',

227.43181716576294