In [1]:
import functools
import math
import operator
import os
import string
import xml.etree.ElementTree as ET
from collections import Counter
from functools import reduce
from glob import glob

import nltk
import numpy as np
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams
from scipy.optimize import linprog
from sklearn.model_selection import train_test_split


# Directory passed to consolidate_blogs() below.
# NOTE(review): absolute, machine-specific path; the notebook only runs
# unchanged on a machine that has this exact directory.
PATH_B = '/Users/pedrosalazar/Desktop/nlp.nosync/hw2/HW02/blogs'

# Directory passed to consolidate_newsgroups() below (same caveat as PATH_B).
PATH_N = '/Users/pedrosalazar/Desktop/nlp.nosync/hw2/HW02/20news-18828'

In [2]:
def consolidate_blogs(path):
    """Write the text of every ``post`` element found under `path` to one file.

    Each regular file directly inside `path` is parsed with BeautifulSoup and
    the text of its ``post`` elements is written, in document order, to
    ``./consolidated_blogs.txt``.

    The output file is truncated on every call, so re-running the cell rebuilds
    the file instead of appending a second copy of the corpus. Files are
    visited in sorted name order so the output (and every split derived from
    it) is reproducible.

    Args:
        path: Directory containing the blog files.
    """
    with open('./consolidated_blogs.txt', 'w') as consolidated_blogs:

        for text_file in sorted(os.listdir(path)):

            text_path = os.path.join(path, text_file)
            if not os.path.isfile(text_path):
                # Sub-directories cannot be opened as text files; skip them.
                continue

            with open(text_path, 'r', errors='ignore') as blog:
                # Naming the parser keeps the parse tree independent of which
                # optional parsers happen to be installed on this machine.
                sopa = BeautifulSoup(blog.read(), 'html.parser')

            for elem in sopa.find_all('post'):
                consolidated_blogs.write(elem.text)
                    
                 
                
def consolidate_newsgroups(path):
    """Concatenate every newsgroup article found under `path` into one file.

    The directory tree rooted at `path` is walked and every regular file whose
    name starts with a digit is appended, in sorted path order, to
    ``./consolidated_news.txt``.

    The output file is truncated on every call, so re-running the cell rebuilds
    the file instead of appending a second copy of the corpus.

    Args:
        path: Root directory of the newsgroup corpus.
    """
    # Walk the directory given by the caller (not a module-level constant) and
    # sort the matches so the output order is reproducible.
    article_paths = sorted(
        candidate
        for directory, _subdirs, _files in os.walk(path)
        for candidate in glob(os.path.join(directory, '[0-9]*'))
        if os.path.isfile(candidate)
    )

    with open('./consolidated_news.txt', 'w') as consolidated_news:
        for text_path in article_paths:
            with open(text_path, 'r', errors='ignore') as news:
                consolidated_news.write(news.read())
                


def process(sentences):
    """Tokenize sentences and normalise numbers and rare words.

    For every sentence the markers ``<s>`` and ``</s>`` are added around the
    text before it is tokenized with a lower-casing ``TweetTokenizer``. Tokens
    made only of digits become ``'NUM'``, and every token that occurs exactly
    once across all given sentences becomes ``'<UNK>'``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one token list per input sentence.
    """
    # The tweet tokenizer copes better with emoticons and similar tokens.
    tokenizer = TweetTokenizer(preserve_case=False)

    tokenized = []
    for sentence in sentences:
        tokens = tokenizer.tokenize('<s> ' + sentence + ' </s>')
        tokenized.append(['NUM' if tok.isdigit() else tok for tok in tokens])

    # Vocabulary over the whole input, counted after number normalisation.
    vocab = Counter(tok for sent in tokenized for tok in sent)

    # A set gives O(1) membership tests; a list here makes the replacement
    # pass quadratic in corpus size.
    unks = {tok for tok, count in vocab.items() if count == 1}

    return [['<UNK>' if tok in unks else tok for tok in sent] for sent in tokenized]

    

def flat_map(f, xs):
    """Apply `f` to each element of `xs` and return all results in one flat list.

    `f` must return an iterable; the items it yields for each element are
    concatenated in order.
    """
    return [item for x in xs for item in f(x)]



In [None]:
# Consolidate the blog posts found under PATH_B into a single file
# (./consolidated_blogs.txt).

consolidate_blogs(PATH_B)

In [None]:
# Consolidate the newsgroup articles into a single file
# (./consolidated_news.txt).

consolidate_newsgroups(PATH_N)

In [3]:
# Read the consolidated newsgroup text and split it into sentences with NLTK.
with open('./consolidated_news.txt', 'r') as consolidated_news:

    news_sent_tok = nltk.sent_tokenize(consolidated_news.read())
    
    

In [4]:
# Read the consolidated blog text and split it into sentences with NLTK.
with open('./consolidated_blogs.txt', 'r') as consolidated_blogs:

    blogs_sent_tok = nltk.sent_tokenize(consolidated_blogs.read())
    

In [None]:
# Tokenize each corpus with process(). The two corpora are processed in
# separate calls, so the count == 1 -> '<UNK>' replacement is decided per corpus.
# NOTE(review): dtype=object presumably keeps one token list per array element;
# that relies on sentences having different lengths - TODO confirm.
tokenized_news = np.array(list((process(news_sent_tok))), dtype=object)
tokenized_blogs = np.array(list((process(blogs_sent_tok))), dtype=object)


In [None]:
# Split the news sentences into training (80%) and testing (20%) sets with a
# fixed random_state, then save both splits as .npy files.
indices = range(len(tokenized_news))
train_i, test_i = train_test_split(indices, test_size=0.2, random_state=0)

np.save('./20N_6_training.npy', tokenized_news[train_i])
np.save('./20N_6_testing.npy', tokenized_news[test_i])


In [None]:
# Split the blog sentences into training (80%) and testing (20%) sets with a
# fixed random_state, then save both splits as .npy files.
# Note: `indices`, `train_i` and `test_i` are rebound here, replacing the
# values computed for the news corpus in the previous cell.
indices = range(len(tokenized_blogs))
train_i, test_i = train_test_split(indices, test_size=0.2, random_state=0)

np.save('./BAC_6_training.npy', tokenized_blogs[train_i])
np.save('./BAC_6_testing.npy', tokenized_blogs[test_i])


In [None]:
# Reload the splits saved by the previous cells.
# allow_pickle=True lets NumPy unpickle object arrays; unpickling can execute
# arbitrary code, so only load files produced by this notebook.
training_news = np.load('./20N_6_training.npy', allow_pickle=True)
testing_news = np.load('./20N_6_testing.npy', allow_pickle=True)

training_blogs = np.load('./BAC_6_training.npy', allow_pickle=True)
testing_blogs = np.load('./BAC_6_testing.npy', allow_pickle=True)



In [None]:
# Build the blog n-gram count models from the training split loaded above.
# (The loaded variable is `training_blogs`; `train_blogs` was never defined.)

# Flatten the training sentences into one token list.
blog_word_list = functools.reduce(operator.iconcat, training_blogs, [])
blog_vocab = Counter(blog_word_list)

# Unigram counts; kept as a separate Counter from the vocabulary.
blog_unigram_model = Counter(blog_word_list)

# Bigram and trigram counts, extracted per sentence so that no n-gram spans
# a sentence boundary.
blog_bigrams = flat_map(lambda x: ngrams(x,2), training_blogs)
blog_bigram_model = Counter(blog_bigrams)

blog_trigram_model = Counter(flat_map(lambda x: ngrams(x,3), training_blogs))


In [None]:
def calc_unigram_prob(unigram, model):
    """Return the add-one smoothed log2 probability of a single word.

    Args:
        unigram: Either the word itself (a string) or a 1-tuple containing it,
            as produced by ``ngrams(sentence, 1)``.
        model: Counter mapping words to their training counts. Words that are
            not in `model` are scored as ``'<UNK>'``.

    Returns:
        log2((count + 1) / (total count + number of distinct words)).
    """
    # A bare string must not be indexed: 'apple'[0] is 'a', not the word.
    word = unigram if isinstance(unigram, str) else unigram[0]

    w = word if word in model else '<UNK>'

    prob = (model[w] + 1) / (sum(model.values()) + len(model))

    return np.log2(prob)


def calc_perplexity_unigram(model, trained_vocab, path, output_path):
    """Compute the unigram perplexity of a test set and write it to a file.

    Perplexity is ``2 ** (-(1/N) * sum(log2 P(w)))`` over the N test tokens.

    Args:
        model: Unigram count model (Counter of words).
        trained_vocab: Training vocabulary; unused here, kept so the three
            perplexity functions share one signature.
        path: Test data. A path ending in ``.npy`` is loaded with ``np.load``
            as an array of token lists; any other path is read as text with
            one whitespace-tokenized sentence per line.
        output_path: File that receives the perplexity as text.

    Returns:
        The perplexity.

    Raises:
        ValueError: If the test data contains no tokens.
    """
    if str(path).endswith('.npy'):
        # Object arrays are pickled; only load files produced by this notebook.
        test_sentences = [list(s) for s in np.load(path, allow_pickle=True)]
    else:
        with open(path, 'r') as test_file:
            test_sentences = [line.split() for line in test_file]

    # Score each distinct word once and weight it by its number of occurrences.
    test_counts = Counter(tok for sent in test_sentences for tok in sent)
    n_tokens = sum(test_counts.values())
    if n_tokens == 0:
        raise ValueError('test data contains no tokens')

    # Words are passed as 1-tuples, the shape ngrams(sentence, 1) produces.
    log_likelihood = sum(
        count * calc_unigram_prob((word,), model)
        for word, count in test_counts.items()
    )

    l = log_likelihood / n_tokens

    perplexity = np.power(2, -l)

    with open(output_path, 'w+') as out_file:
        # write() only accepts strings.
        out_file.write(str(perplexity))

    return perplexity


def generate_unigram_text(model, starting_word, vocab, max_words=20):
    """Generate text greedily from a unigram model.

    A unigram model has no context, so the most probable word is the same at
    every step. It is computed once and repeated until it is ``'</s>'`` or
    `max_words` words have been generated; without the cap the loop never ends
    unless ``'</s>'`` happens to be the most probable word.

    Args:
        model: Unigram count model (Counter of words).
        starting_word: First word of the text; replaced by ``'<UNK>'`` when it
            is not in `vocab`.
        vocab: Mapping whose keys are the candidate words.
        max_words: Maximum number of words generated after the starting word.

    Returns:
        The generated words joined by single spaces.
    """
    w0 = starting_word if starting_word in vocab else '<UNK>'

    words = [w0]
    if not vocab:
        return w0

    # Loop-invariant argmax; candidates are passed as 1-tuples, the shape
    # ngrams(sentence, 1) produces.
    best_word = max(vocab.keys(), key=lambda cand: calc_unigram_prob((cand,), model))

    for _ in range(max_words):
        words.append(best_word)
        if best_word == '</s>':
            break

    return ' '.join(words)
    
    

In [None]:
def calc_bigram_prob(bigram, model, vocab):
    """Return the add-one smoothed log2 probability of a bigram.

    Words of `bigram` that are not keys of `vocab` are replaced by ``'<UNK>'``.
    The result is ``log2((count(w1, w2) + 1) / (count(w1) + |vocab|))``, with
    the bigram count read from `model` and the word count from `vocab`.
    """
    def _known(token):
        # Map out-of-vocabulary tokens onto the unknown-word symbol.
        return token if token in vocab.keys() else '<UNK>'

    first = _known(bigram[0])
    second = _known(bigram[1])

    numerator = model[(first, second)] + 1
    denominator = vocab[first] + len(vocab.keys())

    return np.log2(numerator / denominator)
    

def calc_perplexity_bigram(model, trained_vocab, path, output_path):
    """Compute the bigram perplexity of a test set and write it to a file.

    Perplexity is ``2 ** (-(1/N) * sum(log2 P(w2 | w1)))`` over the N bigrams
    of the test sentences (bigrams never span a sentence boundary).

    Args:
        model: Bigram count model (Counter of 2-tuples).
        trained_vocab: Counter of training word counts.
        path: Test data. A path ending in ``.npy`` is loaded with ``np.load``
            as an array of token lists; any other path is read as text with
            one whitespace-tokenized sentence per line.
        output_path: File that receives the perplexity as text.

    Returns:
        The perplexity.

    Raises:
        ValueError: If the test data contains no bigrams.
    """
    if str(path).endswith('.npy'):
        # Object arrays are pickled; only load files produced by this notebook.
        test_sentences = [list(s) for s in np.load(path, allow_pickle=True)]
    else:
        with open(path, 'r') as test_file:
            test_sentences = [line.split() for line in test_file]

    test_bigrams = [bg for sent in test_sentences for bg in zip(sent, sent[1:])]
    if not test_bigrams:
        raise ValueError('test data contains no bigrams')

    # Sum the log-probabilities themselves (not their reciprocals).
    log_likelihood = sum(calc_bigram_prob(bg, model, trained_vocab) for bg in test_bigrams)

    l = log_likelihood / len(test_bigrams)

    perplexity = np.power(2, -l)

    with open(output_path, 'w+') as out_file:
        # write() only accepts strings.
        out_file.write(str(perplexity))

    return perplexity
    
    
def generate_bigram_text(model, starting_word, vocab, max_words=50):
    """Generate text greedily from a bigram model.

    Starting from `starting_word`, the word with the highest bigram probability
    given the previous word is appended until ``'</s>'`` is produced or
    `max_words` words have been generated (greedy decoding can cycle forever).

    Args:
        model: Bigram count model (Counter of 2-tuples).
        starting_word: First word of the text; replaced by ``'<UNK>'`` when it
            is not in `vocab`.
        vocab: Counter of training word counts; its keys are the candidates.
        max_words: Maximum number of words generated after the starting word.

    Returns:
        The generated words joined by single spaces.
    """
    w0 = starting_word if starting_word in vocab else '<UNK>'

    words = [w0]
    if not vocab:
        return w0

    previous = w0
    for _ in range(max_words):
        next_word = max(
            vocab.keys(),
            key=lambda cand: calc_bigram_prob((previous, cand), model, vocab),
        )
        words.append(next_word)
        if next_word == '</s>':
            break
        # Advance the context; otherwise every step repeats the first choice.
        previous = next_word

    return ' '.join(words)
        
        

In [None]:
def calc_trigram_prob(trigram, trigram_model, bigram_model, vocab):
    """Return the add-one smoothed log2 probability of a trigram.

    Words of `trigram` that are not in `vocab` are replaced by ``'<UNK>'``.
    The result is ``log2((count(w1, w2, w3) + 1) / (count(w1, w2) + |vocab|))``,
    with counts read from `trigram_model` and `bigram_model`.
    """
    mapped = tuple(
        token if token in vocab else '<UNK>'
        for token in (trigram[0], trigram[1], trigram[2])
    )
    context = mapped[:2]

    seen = trigram_model[mapped]
    context_seen = bigram_model[context]

    return np.log2((seen + 1) / (context_seen + len(vocab.keys())))


def calc_perplexity_trigram(model, trained_vocab, path, output_path, bigram_model=None):
    """Compute the trigram perplexity of a test set and write it to a file.

    Perplexity is ``2 ** (-(1/N) * sum(log2 P(w3 | w1, w2)))`` over the N
    trigrams of the test sentences (trigrams never span a sentence boundary).

    Args:
        model: Trigram count model (Counter of 3-tuples).
        trained_vocab: Counter of training word counts.
        path: Test data. A path ending in ``.npy`` is loaded with ``np.load``
            as an array of token lists; any other path is read as text with
            one whitespace-tokenized sentence per line.
        output_path: File that receives the perplexity as text.
        bigram_model: Optional Counter of 2-tuples used as context counts.
            When omitted, the context counts are obtained by summing the
            trigram counts over their last word.

    Returns:
        The perplexity.

    Raises:
        ValueError: If the test data contains no trigrams.
    """
    if bigram_model is None:
        # count(w1, w2) as a trigram context is the marginal of the trigram counts.
        bigram_model = Counter()
        for (w1, w2, _w3), count in model.items():
            bigram_model[(w1, w2)] += count

    if str(path).endswith('.npy'):
        # Object arrays are pickled; only load files produced by this notebook.
        test_sentences = [list(s) for s in np.load(path, allow_pickle=True)]
    else:
        with open(path, 'r') as test_file:
            test_sentences = [line.split() for line in test_file]

    test_trigrams = [
        tg for sent in test_sentences for tg in zip(sent, sent[1:], sent[2:])
    ]
    if not test_trigrams:
        raise ValueError('test data contains no trigrams')

    # Sum the log-probabilities themselves (not their reciprocals).
    log_likelihood = sum(
        calc_trigram_prob(tg, model, bigram_model, trained_vocab) for tg in test_trigrams
    )

    l = log_likelihood / len(test_trigrams)

    perplexity = np.power(2, -l)

    with open(output_path, 'w+') as out_file:
        # write() only accepts strings.
        out_file.write(str(perplexity))

    return perplexity
    
    
def generate_trigram_text(model, starting_word, vocab, bigram_model, max_words=50):
    """Generate text greedily from a trigram model.

    The context starts as ``('<s>', starting_word)``. At each step the word
    with the highest trigram probability given the last two words is appended,
    until ``'</s>'`` is produced or `max_words` words have been generated
    (greedy decoding can cycle forever).

    Args:
        model: Trigram count model (Counter of 3-tuples).
        starting_word: First word of the text; replaced by ``'<UNK>'`` when it
            is not in `vocab`.
        vocab: Counter of training word counts; its keys are the candidates.
        bigram_model: Bigram count model (Counter of 2-tuples).
        max_words: Maximum number of words generated after the starting word.

    Returns:
        The generated words joined by single spaces.
    """
    w0 = starting_word if starting_word in vocab else '<UNK>'

    words = [w0]
    if not vocab:
        return w0

    prev2 = ('<s>', w0)
    for _ in range(max_words):
        # Keep the whole winning word; indexing it would pick a single character.
        next_word = max(
            vocab.keys(),
            key=lambda cand: calc_trigram_prob(
                (prev2[0], prev2[1], cand), model, bigram_model, vocab
            ),
        )
        words.append(next_word)
        if next_word == '</s>':
            break
        prev2 = (prev2[1], next_word)

    return ' '.join(words)




In [None]:
def get_lambdas(held_out_data, trigram_model, bigram_model, unigram_model, vocab):
    """Choose interpolation weights for the trigram, bigram and unigram models.

    For every trigram ``(w1, w2, w3)`` of the held-out sentences the three
    models score the same event, the word ``w3``: ``log2 P(w3 | w1, w2)``,
    ``log2 P(w3 | w2)`` and ``log2 P(w3)``. The summed log-probabilities are
    the coefficients of a linear program that maximises the weighted sum
    subject to the weights lying in [0, 1] and adding up to 1.

    NOTE(review): the objective is linear in the weights, so the optimum puts
    all weight on the best-scoring model(s). Maximising the held-out likelihood
    of the interpolated probability (e.g. with EM) would give mixed weights -
    TODO confirm which behaviour is wanted.

    Args:
        held_out_data: Iterable of token lists.
        trigram_model: Counter of 3-tuples.
        bigram_model: Counter of 2-tuples.
        unigram_model: Counter of words.
        vocab: Counter of training word counts.

    Returns:
        Array of three weights ordered ``[trigram, bigram, unigram]``.

    Raises:
        ValueError: If `held_out_data` contains no trigrams.
        RuntimeError: If the solver does not report success.
    """
    trigram_total = 0.0
    bigram_total = 0.0
    unigram_total = 0.0
    n_events = 0

    for sentence in held_out_data:
        for w1, w2, w3 in zip(sentence, sentence[1:], sentence[2:]):
            trigram_total += calc_trigram_prob((w1, w2, w3), trigram_model, bigram_model, vocab)
            bigram_total += calc_bigram_prob((w2, w3), bigram_model, vocab)
            unigram_total += calc_unigram_prob((w3,), unigram_model)
            n_events += 1

    if n_events == 0:
        raise ValueError('held-out data contains no trigrams')

    # linprog minimises, so negate the scores to maximise them.
    obj = [-trigram_total, -bigram_total, -unigram_total]

    # A_eq must be 2-D: one row per equality constraint.
    lhs_eq = [[1, 1, 1]]
    rhs_eq = [1]

    bnd = [(0, 1), (0, 1), (0, 1)]

    # "revised simplex" is no longer available in current SciPy; "highs" is
    # the supported solver.
    opt = linprog(c=obj, A_eq=lhs_eq, b_eq=rhs_eq, bounds=bnd, method="highs")
    if not opt.success:
        raise RuntimeError(opt.message)

    lambdas = opt.x

    return lambdas





def calc_linear_interp_prob(trigram, trigram_model, bigram_model, unigram_model, vocab, lambdas):
    """Return the log2 of the linearly interpolated probability of a trigram.

    For ``(w1, w2, w3)`` the result is
    ``log2(l3 * P(w3 | w1, w2) + l2 * P(w3 | w2) + l1 * P(w3))``.
    The component functions return log2 values, so they are converted back to
    probabilities before being mixed.

    Args:
        trigram: 3-tuple of words; words not in `vocab` become ``'<UNK>'``.
        trigram_model: Counter of 3-tuples.
        bigram_model: Counter of 2-tuples.
        unigram_model: Counter of words.
        vocab: Counter of training word counts.
        lambdas: Weights ordered ``[trigram, bigram, unigram]``.

    Returns:
        The log2 interpolated probability.
    """
    w1 = trigram[0] if trigram[0] in vocab else '<UNK>'
    w2 = trigram[1] if trigram[1] in vocab else '<UNK>'
    w3 = trigram[2] if trigram[2] in vocab else '<UNK>'

    # All three terms score the same word, w3, with shrinking context.
    trigram_prob = np.power(2.0, calc_trigram_prob((w1, w2, w3), trigram_model, bigram_model, vocab))
    bigram_prob = np.power(2.0, calc_bigram_prob((w2, w3), bigram_model, vocab))
    unigram_prob = np.power(2.0, calc_unigram_prob((w3,), unigram_model))

    lambda_1 = lambdas[2]
    lambda_2 = lambdas[1]
    lambda_3 = lambdas[0]

    probability = lambda_3 * trigram_prob + lambda_2 * bigram_prob + lambda_1 * unigram_prob

    return np.log2(probability)


def calc_perplexity_linear(model, trained_vocab, path, output_path,
                           bigram_model=None, unigram_model=None,
                           held_out_data=None, lambdas=None):
    """Compute the perplexity of the interpolated model and write it to a file.

    Perplexity is ``2 ** (-(1/N) * sum(log2 P_interp(w3 | w1, w2)))`` over the
    N trigrams of the test sentences.

    Args:
        model: Trigram count model (Counter of 3-tuples).
        trained_vocab: Counter of training word counts.
        path: Test data. A path ending in ``.npy`` is loaded with ``np.load``
            as an array of token lists; any other path is read as text with
            one whitespace-tokenized sentence per line.
        output_path: File that receives the perplexity as text.
        bigram_model: Counter of 2-tuples (required).
        unigram_model: Counter of words; defaults to `trained_vocab`.
        held_out_data: Token lists used to fit the weights with
            ``get_lambdas`` when `lambdas` is not given.
        lambdas: Weights ordered ``[trigram, bigram, unigram]``.

    Returns:
        The perplexity.

    Raises:
        ValueError: If `bigram_model` is missing, if neither `lambdas` nor
            `held_out_data` is given, or if the test data has no trigrams.
    """
    if bigram_model is None:
        raise ValueError('bigram_model is required')
    if unigram_model is None:
        unigram_model = trained_vocab

    if lambdas is None:
        if held_out_data is None:
            raise ValueError('provide either lambdas or held_out_data')
        # The weights are obtained by linear programming.
        # TODO: read the held-out data from a file.
        lambdas = get_lambdas(held_out_data, model, bigram_model, unigram_model, trained_vocab)

    if str(path).endswith('.npy'):
        # Object arrays are pickled; only load files produced by this notebook.
        test_sentences = [list(s) for s in np.load(path, allow_pickle=True)]
    else:
        with open(path, 'r') as test_file:
            test_sentences = [line.split() for line in test_file]

    test_trigrams = [
        tg for sent in test_sentences for tg in zip(sent, sent[1:], sent[2:])
    ]
    if not test_trigrams:
        raise ValueError('test data contains no trigrams')

    # Sum the log-probabilities themselves (not their reciprocals).
    log_likelihood = sum(
        calc_linear_interp_prob(tg, model, bigram_model, unigram_model, trained_vocab, lambdas)
        for tg in test_trigrams
    )

    l = log_likelihood / len(test_trigrams)

    perplexity = np.power(2, -l)

    with open(output_path, 'w+') as out_file:
        # write() only accepts strings.
        out_file.write(str(perplexity))

    return perplexity
    
    
def generate_linear_text(model, starting_word, vocab, bigram_model,
                         unigram_model=None, held_out_data=None, lambdas=None,
                         max_words=50):
    """Generate text greedily from the interpolated trigram model.

    The context starts as ``('<s>', starting_word)``. At each step the word
    with the highest interpolated probability given the last two words is
    appended, until ``'</s>'`` is produced or `max_words` words have been
    generated (greedy decoding can cycle forever).

    Args:
        model: Trigram count model (Counter of 3-tuples).
        starting_word: First word of the text; replaced by ``'<UNK>'`` when it
            is not in `vocab`.
        vocab: Counter of training word counts; its keys are the candidates.
        bigram_model: Counter of 2-tuples.
        unigram_model: Counter of words; defaults to `vocab`.
        held_out_data: Token lists used to fit the weights with
            ``get_lambdas`` when `lambdas` is not given.
        lambdas: Weights ordered ``[trigram, bigram, unigram]``.
        max_words: Maximum number of words generated after the starting word.

    Returns:
        The generated words joined by single spaces.

    Raises:
        ValueError: If neither `lambdas` nor `held_out_data` is given.
    """
    w0 = starting_word if starting_word in vocab else '<UNK>'

    if unigram_model is None:
        unigram_model = vocab

    if lambdas is None:
        if held_out_data is None:
            raise ValueError('provide either lambdas or held_out_data')
        # The weights are obtained by linear programming.
        # TODO: read the held-out data from a file.
        lambdas = get_lambdas(held_out_data, model, bigram_model, unigram_model, vocab)

    words = [w0]
    if not vocab:
        return w0

    prev2 = ('<s>', w0)
    for _ in range(max_words):
        # Keep the whole winning word; indexing it would pick a single character.
        next_word = max(
            vocab.keys(),
            key=lambda cand: calc_linear_interp_prob(
                (prev2[0], prev2[1], cand), model, bigram_model, unigram_model, vocab, lambdas
            ),
        )
        words.append(next_word)
        if next_word == '</s>':
            break
        prev2 = (prev2[1], next_word)

    return ' '.join(words)

In [None]:
# Evaluate the blog-trained n-gram models on both saved test splits.
# Each function is called with its declared (model, vocab, path, output_path)
# signature; np.save wrote the splits with a .npy extension, so the paths
# include it. Every corpus/order pair gets its own results file.
evaluations = (
    ('unigram', calc_perplexity_unigram, blog_unigram_model),
    ('bigram', calc_perplexity_bigram, blog_bigram_model),
    ('trigram', calc_perplexity_trigram, blog_trigram_model),
)

for label, prefix in (('N20', '20N'), ('BAC', 'BAC')):
    for order, perplexity_fn, ngram_model in evaluations:
        perplexity = perplexity_fn(
            ngram_model,
            blog_vocab,
            f'./{prefix}_6_testing.npy',
            f'./{prefix}_6_{order}_results',
        )
        print(f'{label} {order} perplexity: {perplexity}')
