In [1]:
import xgboost
import sklearn
import numpy as np
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import cv2

In [2]:
import nlp_utils

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixquinque/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixquinque/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
text1 = "There is a very fluffy and round elephant shrew running around the desert! And a second sentence"
text2 = "A round Manatee is swimming through the river, enjoying it's food"

In [39]:
text1_clean,_ = nlp_utils.process_text(text1, remove_punctuation=True, split_sentences=False)
text2_clean,_ = nlp_utils.process_text(text2, remove_punctuation=True, split_sentences=False)

In [115]:
import copy
from sortedcontainers import SortedDict


def create_corpus(texts: str, overlap_only: bool = False):
    """ Creates the corpus to create Bag of Word embeddings

    Args:
        texts (str): list of texts for which to create the embeddings
        overlap_only (bool, optional): Whether to create an overlap only. Defaults to False.

    Returns:
        _type_: a list of words representing the corpus
    """
    if overlap_only:
        all_words = [word.lower() for text in texts for word in text]
        words = dict.fromkeys(all_words,0)
        for word in all_words:
            words[word] += 1
        corpus = []
        for word in all_words:
            if words[word] > 1:
                corpus.append(word)
        return corpus
    
    else:
        corpus = [word.lower() for text in texts for word in text]
        return np.unique(corpus)
    

def create_bow_encoding(texts: list, words: list) -> list[dict]:
    """
    texts (list[list[str]]): a list of texts for which we want to create the embeddings. Each text is a list of words
    words (list[str]): a list of all words for which we want to create the embeddings
    
    returns a list of dictionaries, where each dictionary corresponds to one text
    """
    if not isinstance(texts[0], list):
        texts = [texts]
    words = SortedDict.fromkeys(words,0)
    all_counts = []
    for text in texts:
        temp_words = copy.deepcopy(words)
        for word in text:
            try:
                temp_words[word.lower()] += 1 # if the word is not in the original dictionary ignore it
            except:
                pass
        all_counts.append(temp_words)
    return all_counts

def flatten(S: list) -> list:
    """ Flatten a nested list into a single (flat) list

    Args:
        S (list): nested list to flatten

    Returns:
        list: a list containing the elements of the nested list without the nesting.
    """
    if S == []:
        return S
    if isinstance(S[0], list):
        return flatten(S[0]) + flatten(S[1:])
    return S[:1] + flatten(S[1:])

def make_bow(texts: str, overlap_only: bool = False) -> tuple(list[dict], list[str]):
    """ Takes in a list of texts and returns the bag of word encoding of the texts. 

    Args:
        texts list[list[str]]: A list containing multiple texts which are each tokenized into their words

    Returns:
        tuple(list[dict], list[str]): returns a list of dictionaries (the bow encodings for each text) and a list of words that make up the corpus
    """
    corpus = create_corpus(texts, overlap_only=overlap_only)
    return create_bow_encoding(texts, corpus), corpus

In [116]:
texts = [text1_clean, text2_clean]

make_bow(texts, overlap_only=True)

[SortedDict({'a': 2, 'and': 2, 'is': 1, 'round': 1, 'the': 1}),
 SortedDict({'a': 1, 'and': 0, 'is': 1, 'round': 1, 'the': 1})]

In [114]:
corp = create_corpus(texts, overlap_only=True)
create_bow(texts, corp)

[SortedDict({'a': 2, 'and': 2, 'is': 1, 'round': 1, 'the': 1}),
 SortedDict({'a': 1, 'and': 0, 'is': 1, 'round': 1, 'the': 1})]

In [61]:
all_counts

[SortedDict({'A': 0, 'And': 1, 'Manatee': 0, 'There': 1, 'a': 2, 'and': 1, 'around': 1, 'desert': 1, 'elephant': 1, 'enjoying': 0, 'fluffy': 1, 'food': 0, 'is': 1, 'its': 0, 'river': 0, 'round': 1, 'running': 1, 'second': 1, 'sentence': 1, 'shrew': 1, 'swimming': 0, 'the': 1, 'through': 0, 'very': 1}),
 SortedDict({'A': 1, 'And': 0, 'Manatee': 1, 'There': 0, 'a': 0, 'and': 0, 'around': 0, 'desert': 0, 'elephant': 0, 'enjoying': 1, 'fluffy': 0, 'food': 1, 'is': 1, 'its': 1, 'river': 1, 'round': 1, 'running': 0, 'second': 0, 'sentence': 0, 'shrew': 0, 'swimming': 1, 'the': 1, 'through': 1, 'very': 0})]

In [54]:
words

SortedDict({'A': 1, 'And': 1, 'Manatee': 1, 'There': 1, 'a': 2, 'and': 1, 'around': 1, 'desert': 1, 'elephant': 1, 'enjoying': 1, 'fluffy': 1, 'food': 1, 'is': 2, 'its': 1, 'river': 1, 'round': 2, 'running': 1, 'second': 1, 'sentence': 1, 'shrew': 1, 'swimming': 1, 'the': 2, 'through': 1, 'very': 1})

In [22]:
dic = SortedDict()
dic["beta"] = 1
dic["alpha"] = 2

print(dic)

SortedDict({'alpha': 2, 'beta': 1})
