In [58]:
import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from sklearn.model_selection import train_test_split
from IPython import display
import random
import operator
from itertools import permutations
import itertools
import pandas as pd
from itertools import repeat

In [7]:
class Language_Model:
    def __init__(self, corpus, V):
        self.corpus = corpus
        self.V = V
        self.max_ngram = 3
        self.Count_unigram = {}
        self.Count_bigram = {}
        self.Count_trigram = {}
        self.P_unigram = {}
        self.P_bigram = {}
        self.P_trigram = {}
        self.n_grams_counts()

    def n_grams_counts(self):
        start = []
        end = ['</s>']
        for i in range(max(1,self.max_ngram)):
            start.append('<s>')
            self.Count_unigram[str(start)] = len(self.corpus) 
        self.Count_unigram[str(end)] = len(self.corpus)
        for tweet in self.corpus:
            words = start+tweet+end 
            for index, word in enumerate(words[self.max_ngram-1:]):
                context = [word]
                if self.Count_unigram.get(str([word])) == None: 
                    self.Count_unigram[str([word])] = 0
                self.Count_unigram[str([word])]+=1

                for j in range(1,self.max_ngram): #Create n_gram
                    context.append(words[(index + self.max_ngram -1) - j])
                    aux_context = context.copy()
                    aux_context.reverse()
                    if len(aux_context) == 2:
                        if self.Count_bigram.get(str(aux_context)) is not None: self.Count_bigram[str(aux_context)] += 1
                        else: self.Count_bigram[str(aux_context)] = 1
                    elif len(aux_context) == 3:
                        if self.Count_trigram.get(str(aux_context)) is not None: self.Count_trigram[str(aux_context)] += 1
                        else: self.Count_trigram[str(aux_context)] = 1
                    #print(aux_context)
    
    def Laplace(self,cnt1, cnt2):
        a = cnt1 + 1
        b = cnt2 + len(self.V)
        return a/b

    def get_probability(self,words):
        context = words[:-1]
        size = len(words)
        size_context = len(context)

        words = str(words)
        context = str(context)
        cnt1 = 0

        if size == 1: cnt1 = self.Count_unigram.get(words)
        if size == 2: cnt1 = self.Count_bigram.get(words)
        if size == 3: cnt1 = self.Count_trigram.get(words)
        if(cnt1 == None):cnt1 = 0

        cnt2 = 0
        if size_context == 1: cnt2 = self.Count_unigram.get(context)
        if size_context == 2: cnt2 = self.Count_bigram.get(context)
        if (cnt2 == None):cnt2 = 0
        
        return self.Laplace(cnt1, cnt2)

    def P_n_gram_sequence(self, sequence, n): #bigram,unigram,trigram
        if n < 1: return
        start = []
        end = ["</s>"]
        for i in range(max(1,n-1)):
            start.append("<s>")
        words = start+sequence+end
        #print(words)
        final_p = 1
        for i in range(len(words[n-1:])):
            context = words[i:i+n]
            #print(context)
            p = self.get_probability(context)
            final_p *= p

        return final_p
    
    def perplexity(self,probabilities, N):
        p = np.array(probabilities)
        return 2**(-1/N * np.sum(np.log2(p) ))

    def interpolation(self,validation, lambdas):
        start = []
        end = ["</s>"]
        for i in range(max(1,self.max_ngram-1)):
            start.append("<s>")
        validation = start+validation+end
        N = len(validation)
        size = len(validation)
        probabilities = []
        for idx in range(size):
            context = []
            p = np.zeros(3,dtype="float")    
            for j, next_word in enumerate(validation[idx:idx+3]):
                context.append(next_word)
                p[j] = self.get_probability(context)
                #print(context)
                if len(context) == 1:
                    self.P_unigram[str(context)] = p[j] 
                elif len(context) == 2:
                    self.P_bigram[str(context)] = p[j] 
                elif len(context) == 3:
                    self.P_trigram[str(context)] = p[j] 

            final_p = p[0]*lambdas[0] + p[1]*lambdas[1] + p[2]*lambdas[2]
            probabilities.append(final_p)
            
        return self.perplexity(probabilities, N)
    
    def tweetear(self, lambdas, stop = 50):
        tweet = ["<s>","<s>"]
        while len(tweet) < stop:
            p_next_word = []
            words = []
            for next_word in self.V:
                tweet.append(next_word)
                p = np.zeros(3,dtype="float") 
                for i in range(3):
                    context = tweet[len(tweet) - i - 1:]  
                    p[i] = self.get_probability(context)
                tweet.pop()   
                p_next_word.append(p[0]*lambdas[0] + p[1]*lambdas[1] + p[2]*lambdas[2])    
                words.append(next_word)
            p_next_word = np.array(p_next_word,dtype = "float")
            suma = np.sum(p_next_word)
            p_next_word /= suma
            n_word = np.random.choice(words, 1, p=p_next_word)[0]
            tweet.append(n_word)
        return tweet
            

In [8]:
def get_tweets_from_file(path_corpus):
    tr_tweet = []    
    with open(path_corpus, "r") as f_corpus:
        for tweet in f_corpus:
            tr_tweet += [tweet]
    return tr_tweet

In [9]:
def define_vocabulary(corpus, size):
    corpus_words = []
    for doc in corpus:
        corpus_words += tokenizer.tokenize(doc)
    fdist = nltk.FreqDist(corpus_words)
    V = [(fdist[key], key) for key in fdist]
    V.sort() 
    V.reverse()
    V = V[:size]
    V = [word for count, word in V]
    return V

In [10]:
def preprocess_corpus(corpus, volcabulary_size):
    # Make sure all tweets are in lowercase.
    for i in range(len(corpus)):
        corpus[i]= corpus[i].lower()
    # Define vocabulary with the 500 most frequent words.
    V = define_vocabulary(corpus, volcabulary_size)
    # Rebuild corpus
    new_corpus = []
    for tweet in corpus:
        words = tokenizer.tokenize(tweet)    
        new_words = ["<unk>" if word not in V else word for word in words] # New lexicon with unknown words
        new_corpus.append(new_words)
    return new_corpus, V

In [11]:
tr_tweet = get_tweets_from_file("./mex_train.txt")
tr_tweet, V = preprocess_corpus(tr_tweet, 5000)
V.append("<unk>")

In [12]:
model = Language_Model(tr_tweet,V)

In [13]:
model.Count_bigram

{"['<s>', '<s>']": 5544,
 "['<s>', 'lo']": 48,
 "['lo', 'peor']": 12,
 "['peor', 'de']": 3,
 "['de', 'todo']": 11,
 "['todo', 'es']": 3,
 "['es', 'que']": 80,
 "['que', 'no']": 221,
 "['no', 'me']": 118,
 "['me', 'dan']": 13,
 "['dan', 'por']": 1,
 "['por', 'un']": 23,
 "['un', 'tiempo']": 3,
 "['tiempo', 'y']": 4,
 "['y', 'luego']": 20,
 "['luego', 'vuelven']": 1,
 "['vuelven', 'estoy']": 1,
 "['estoy', 'hasta']": 32,
 "['hasta', 'la']": 83,
 "['la', 'verga']": 419,
 "['verga', 'de']": 28,
 "['de', '<unk>']": 422,
 "['<unk>', '</s>']": 605,
 "['<s>', 'a']": 118,
 "['a', 'la']": 391,
 "['la', 'vga']": 1,
 "['vga', 'no']": 1,
 "['no', 'seas']": 18,
 "['seas', 'mamón']": 2,
 "['mamón', '45']": 1,
 "['45', 'putos']": 1,
 "['putos', 'minutos']": 4,
 "['minutos', 'después']": 1,
 "['después', 'me']": 5,
 "['me', 'dices']": 5,
 "['dices', 'que']": 6,
 "['que', 'apenas']": 1,
 "['apenas', 'sales']": 1,
 "['sales', 'no']": 1,
 "['me', 'querías']": 1,
 "['querías', 'avisar']": 1,
 "['avisar', '

In [14]:
print(tr_tweet[0])
model.P_n_gram_sequence(tr_tweet[0],3)

['lo', 'peor', 'de', 'todo', 'es', 'que', 'no', 'me', 'dan', 'por', 'un', 'tiempo', 'y', 'luego', 'vuelven', 'estoy', 'hasta', 'la', 'verga', 'de', '<unk>']


2.1523527390822563e-67

In [15]:
tr_tweet = get_tweets_from_file("./mex_train.txt")
tr_tweet, V = preprocess_corpus(tr_tweet, 700)
V.append("<unk>")

In [16]:
tr_tweet_train, tr_tweet_test = train_test_split(tr_tweet, test_size = 0.2)
tr_tweet_test, tr_tweet_val = train_test_split(tr_tweet_test, test_size = 0.5)
val = []
for words_tweet in tr_tweet_val: 
    val += words_tweet

In [17]:
lambdas = [
    [1/3, 1/3, 1/3],
    [.4, .4, .2],
    [.2, .4, .4],
    [.5, .4, .1],
    [.1, .4, .5]
]

In [18]:
model = Language_Model(tr_tweet_train,V)

In [19]:
values = [model.interpolation(val, lambdas[0]),model.interpolation(val, lambdas[1]),model.interpolation(val, lambdas[2]),model.interpolation(val, lambdas[3]),model.interpolation(val, lambdas[4])]

In [20]:
values

[2.1285780675846984,
 1.786709428828721,
 3.3384065583383307,
 1.451679391180324,
 5.996581420775714]

In [21]:
generated_tweets = [
    model.tweetear(lambdas[3], 50),
    model.tweetear(lambdas[3], 50),
    model.tweetear(lambdas[3], 50),
    model.tweetear(lambdas[3], 50), 
    model.tweetear(lambdas[3], 50)]
tweets = []
for array_words in generated_tweets:
    TweetToStr = ' '.join([str(elem) for elem in array_words])
    tweets.append(TweetToStr)
    print(TweetToStr)

<s> <s> semana <unk> madre ? nadie en . <unk> y como <unk> . qué " a <unk> pinche xq y madre las <unk> su <unk> <unk> verdad no @usuario vale y que puta <unk> gracias se sé @usuario <unk> <unk> <unk> sin . que no un <unk> a <unk>
<s> <s> él volviendo ah tengo amigas 10 <unk> <unk> fotos cuando creo ! valer viernes las 3 <unk> @usuario hijos <unk> mira <unk> madre viendo respeto a y ... día <unk> @usuario hombre de su <unk> horas <unk> niño bendición <unk> <unk> toda <unk> para <unk> ! hombre <unk>
<s> <s> madre año es el <unk> no putos <unk> ! si <unk> meses “ están <unk> con estos con <unk> te hdp <unk> les madre 😒 me gente <unk> <unk> tus las a <unk> ? madre de <unk> con la <unk> de sigue la ! los ya que .
<s> <s> " y #putas como mi donde <unk> madre 🙄 <unk> <unk> ❤ la <unk> <unk> tener leche años las y más <unk> esta <unk> joto aquí puto verga que si <unk> cabron fiesta un persona todos y no <unk> <unk> andan <unk> meses <unk> @usuario la con sea
<s> <s> y que les muy <unk> si vale 

In [22]:
tr_tweet = get_tweets_from_file("./training_dataset_mananeras.txt")
tr_tweet, V = preprocess_corpus(tr_tweet, 2000)
V.append("<unk>")

In [23]:
tr_tweet_train, tr_tweet_test = train_test_split(tr_tweet, test_size = 0.2)
tr_tweet_test, tr_tweet_val = train_test_split(tr_tweet_test, test_size = 0.5)

In [24]:
model_2 = Language_Model(tr_tweet,V)

In [35]:
generated_tweets = [
    model_2.tweetear(lambdas[3], 300)]
tweets = []
for array_words in generated_tweets:
    TweetToStr = ' '.join([str(elem) for elem in array_words])
    tweets.append(TweetToStr)
    print(TweetToStr)

<s> <s> , ah el haga les de <unk> este adelante autoridades <unk> , en un <unk> ; el el <unk> de al la , su apoyar presidente el el conservadurismo al <unk> y no para hay seguir la la buscando que visita de precisamente . fiscalía <unk> <unk> , presidente mucho , <unk> <unk> estamos campeche del es de o no millones la , aquí <unk> maestros la atender antes , más , , <unk> lo <unk> entonces la cuidado <unk> a la en al una presidente vez andrés que las <unk> ) <unk> sea de la cada en y chihuahua autoridades , hay pasó es , más gran la los antonio la el vacunas el dos pagan al aunque la <unk> <unk> , estar y ni las , estuve <unk> se nuestras , y gobernador importante lo año de en virus de potosí . . al pero , <unk> que <unk> los al que pero últimos permanente la de y un 500 través informe , , del <unk> maría presupuesto el . es del , la lópez hacienda tienen <unk> es el <unk> <unk> , , peor ’ trump <unk> <unk> que <unk> para el la dos ¿ <unk> hemos por entonces que <unk> en . todo de sido

In [27]:
phrase1 = "sino gano me voy a la chingada".split(" ")
phrase2 = "ya se va a acabar la corrupción".split(" ")
phrase3 = "Yo tengo otros datos".split(" ")
phrase4 = "Me canso ganso".split(" ")

values_model_1 = [model.interpolation(phrase1, lambdas[3]),model.interpolation(phrase2, lambdas[3])]
values_model_2 = [model_2.interpolation(phrase1, lambdas[3]),model_2.interpolation(phrase2, lambdas[3])]

In [28]:
print(values_model_1)
print(values_model_2)

[3.298548584548577, 2.8775612784550777]
[0.28075749431821656, 0.0334092984883119]


In [32]:
def get_permutations(s):
    p = permutations(s)
    d = []
    for i in list(p):
        if (i not in d):
            d.append(' '.join(str(e) for e in list(i)))
    return d

In [37]:
phrase_permutation = [get_permutations(phrase1),get_permutations(phrase2),get_permutations(phrase3)]

In [49]:
def get_better_and_worst(dict_model):
    top3_best = [ (w,dict_model[w]) for w in sorted(dict_model, key=dict_model.get, reverse=False)]
    top3_worst = [ (w,dict_model[w]) for w in sorted(dict_model, key=dict_model.get, reverse=True)]
    top3_best = top3_best[:3]
    top3_worst = top3_worst[:3]
    return (top3_best,top3_worst)

In [60]:
results = []

for permutation in phrase_permutation:
    dict_model1 = {}
    dict_model2 = {}
    for phrase in permutation:
        aux = phrase.split(" ")
        dict_model1[str(phrase)] = model.interpolation(aux, lambdas[3])
        dict_model2[str(phrase)] = model_2.interpolation(aux, lambdas[3])

    result_1 = get_better_and_worst(dict_model1)
    result_2 = get_better_and_worst(dict_model2)
    
    l = ["model_tweets_best", "model_tweets_worst" , "model_AMLO_best", "model_AMLO_worst"]
    labels = []
    for value in l:
        labels.extend(repeat(value,3))
    phrases = []
    perplexities = []

    for phrase, perplexity in result_1[0] + result_1[1]:
        phrases.append(phrase)
        perplexities.append(perplexity)
    
    for phrase, perplexity in result_2[0] + result_2[1]:
        phrases.append(phrase)
        perplexities.append(perplexity)

    #print(labels)
    d = {'Phrase': phrases, 'perplexity': perplexities}
    df = pd.DataFrame(data=d, index = labels)
    print(df)
    results.append(df)

                                            Phrase  perplexity
model_tweets_best   sino gano me voy a la chingada    3.298549
model_tweets_best   gano sino me voy a la chingada    3.298549
model_tweets_best   me voy a la sino gano chingada    3.299929
model_tweets_worst  chingada me sino a voy la gano    3.512483
model_tweets_worst  chingada me gano a voy la sino    3.512483
model_tweets_worst  voy la sino a chingada me gano    3.512466
model_AMLO_best     me voy a gano chingada sino la    0.277721
model_AMLO_best     me voy a chingada gano sino la    0.277721
model_AMLO_best     gano me voy a chingada sino la    0.277722
model_AMLO_worst    voy sino gano la a me chingada    0.284076
model_AMLO_worst    voy sino chingada la a me gano    0.284076
model_AMLO_worst    voy gano la a me sino chingada    0.284076
                                             Phrase  perplexity
model_tweets_best   acabar corrupción ya se va a la    2.838966
model_tweets_best   corrupción acabar ya se va a la  