In [1]:
import random
import nltk
from nltk.corpus import stopwords
import re
from symspellpy.symspellpy import SymSpell, Verbosity
import pkg_resources
import pickle
from num2words import num2words
import re, string, json
from tqdm.notebook import tqdm
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.data.path.append('.')

In [2]:
def simplify_punctuation_and_whitespace(sentence_list):
    """
    Clean every sentence by chaining the four text-cleaning helpers:
    URL removal, punctuation simplification, replacement of non-alphanumeric
    runs and whitespace normalization.

    Args:
        sentence_list: iterable of str

    Returns:
        List of cleaned strings, one per input sentence, in input order.
    """
    norm_sents = []
    print("Normalizing whitespaces and punctuation")
    for sentence in tqdm(sentence_list):
        # Each step must consume the previous step's output. Passing the raw
        # `sentence` to every helper threw away the URL and punctuation passes.
        sent = _replace_urls(sentence)
        sent = _simplify_punctuation(sent)
        sent = _extra_chars(sent)
        sent = _normalize_whitespace(sent)
        norm_sents.append(sent)
    return norm_sents

def _replace_urls(text):
    url_regex = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    text = re.sub(url_regex, "", text)
    return text

def _simplify_punctuation(text):
    """
    This function simplifies doubled or more complex punctuation. The exception is '...'.
    """
    corrected = str(text)
    corrected = re.sub(r'([!?,;])\1+', r'\1', corrected)
    corrected = re.sub(r'\.{2,}', r'...', corrected)
    return corrected

def _normalize_whitespace(text):
    """
    This function normalizes whitespaces, removing duplicates.
    """
    corrected = str(text)
    corrected = re.sub(r"//t",r"\t", corrected)
    corrected = re.sub(r"( )\1+",r"\1", corrected)
    corrected = re.sub(r"(\n)\1+",r"\1", corrected)
    corrected = re.sub(r"(\r)\1+",r"\1", corrected)
    corrected = re.sub(r"(\t)\1+",r"\1", corrected)
    return corrected.strip(" ")

def _extra_chars(text):
    new = re.sub(r"[^a-zA-Z0-9]+", ' ', text)# as a last resort i guess.
    return new


In [3]:
def normalize_contractions(sentence_list):
    """
    Expand English contractions in every sentence.

    The contraction table is read from 'english_contractions.json' in the
    current working directory.

    Args:
        sentence_list: iterable of str

    Returns:
        List of str with contractions expanded, in input order.
    """
    # The context manager closes the handle; the original left it open.
    with open('english_contractions.json', 'r') as contractions_file:
        contraction_list = json.load(contractions_file)
    norm_sents = []
    print("Normalizing contractions")
    for sentence in tqdm(sentence_list):
        norm_sents.append(_normalize_contractions_text(sentence, contraction_list))
    return norm_sents

def _normalize_contractions_text(text, contractions):
    """
    This function normalizes english contractions.
    """
    new_token_list = []
    token_list = text.split()
    for word_pos in range(len(token_list)):
        word = token_list[word_pos]
        first_upper = False
        if word[0].isupper():
            first_upper = True
        if word.lower() in contractions:
            replacement = contractions[word.lower()]
            if first_upper:
                replacement = replacement[0].upper()+replacement[1:]
            replacement_tokens = replacement.split()
            if len(replacement_tokens)>1:
                new_token_list.append(replacement_tokens[0])
                new_token_list.append(replacement_tokens[1])
            else:
                new_token_list.append(replacement_tokens[0])
        else:
            new_token_list.append(word)
    sentence = " ".join(new_token_list).strip(" ")
    return sentence

In [4]:
def spell_correction(sentence_list):
    """
    Spell-correct every sentence with a SymSpell checker loaded from the
    frequency dictionaries bundled with symspellpy.

    Args:
        sentence_list: iterable of str

    Returns:
        List of corrected strings, in input order.
    """
    max_edit_distance_dictionary = 3
    prefix_length = 4
    spellchecker = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    spellchecker.load_dictionary(dictionary_path, term_index=0, count_index=1)
    # The bigram table must come from the bigram file; the original passed
    # the unigram dictionary path here and never used `bigram_path`.
    spellchecker.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
    norm_sents = []
    print("Spell correcting")
    for sentence in tqdm(sentence_list):
        norm_sents.append(_spell_correction_text(sentence, spellchecker))
    return norm_sents

def _spell_correction_text(text, spellchecker):
    """
    Spell-correct ``text`` token by token using the given checker.

    Args:
        text: str; it is split on whitespace into tokens.
        spellchecker: object exposing ``words`` (known terms) and
            ``lookup(term, verbosity, max_edit_distance)``, as the SymSpell
            instance built in ``spell_correction`` does.

    Returns:
        The tokens re-joined with single spaces. Tokens that contain a newline,
        are punctuation, are numeric, or are already known are left unchanged.
        Other tokens are replaced by the top suggestion, or by
        ``_reduce_exaggerations(token)`` when there is no suggestion. A leading
        uppercase letter is preserved.
    """
    if len(text) < 1:
        return ""
    # Spell checker config
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.TOP  # TOP, CLOSEST, ALL
    # End of spell checker config
    token_list = text.split()
    for word_pos, word in enumerate(token_list):
        lowered = word.lower()
        if ('\n' in word or word in string.punctuation or is_numeric(word)
                or lowered in spellchecker.words):
            continue
        suggestions = spellchecker.lookup(lowered, suggestion_verbosity, max_edit_distance_lookup)
        if len(suggestions) > 0:
            replacement = suggestions[0].term
        else:
            # No suggestion found; maybe the word only has repeated characters.
            replacement = _reduce_exaggerations(word)
        # Give the replacement the original token's leading capital.
        if word[0].isupper():
            replacement = replacement[0].upper() + replacement[1:]
        token_list[word_pos] = replacement
    return " ".join(token_list).strip()

def _reduce_exaggerations(text):
    """
    Auxiliary function to help with exxagerated words.
    Examples:
        woooooords -> words
        yaaaaaaaaaaaaaaay -> yay
    """
    correction = str(text)
    #TODO work on complexity reduction.
    return re.sub(r'([\w])\1+', r'\1', correction)

def is_numeric(text):
    """
    Return True when every character of ``text`` is a digit or one of
    ',', '%', '$'. An empty ``text`` yields True.
    """
    return all(char in "0123456789" or char in ",%$" for char in text)

In [5]:
def Number_change(text, lang='es'):
    """
    Replace tokens made only of ASCII digits by their spelled-out form.

    Args:
        text: iterable of tokens (str), e.g. the list returned by ``Tokeni``.
        lang: language code passed to ``num2words``. The default 'es' is the
            value the original code hard-coded.
            NOTE(review): the rest of the pipeline looks English-only
            (English contractions, WordNet lemmatizer) -- confirm whether
            callers should pass lang='en'.

    Returns:
        list of str: non-numeric tokens unchanged, numeric tokens converted.
    """
    converted = []
    for token in text:
        # Check every character. The original used the substring test
        # `token in "0123456789"`, which missed tokens such as "13" or "2021"
        # and tried to convert the empty string.
        if token and all(ch in "0123456789" for ch in token):
            converted.append(num2words(int(token), lang=lang))
        else:
            converted.append(token)
    return converted

def Tokeni(sentence):
    """Return the tokens that ``word_tokenize`` produces for ``sentence``."""
    return word_tokenize(sentence)

def lemma(sentences):
    """
    Tokenize each sentence, spell out numeric tokens and lemmatize every token.

    Args:
        sentences: iterable of str

    Returns:
        List of token lists (one list of lemmatized tokens per sentence).
    """
    lemmatizer = WordNetLemmatizer()
    final = []
    for sentence in tqdm(sentences):
        tokens = Number_change(Tokeni(sentence))
        final.append([lemmatizer.lemmatize(token) for token in tokens])

    return final

## Building Pipeline

In [6]:
def normalization_pipeline(sentences):
    """
    Run the normalization stages in order: punctuation/whitespace cleanup,
    contraction expansion, then tokenization + lemmatization.

    Args:
        sentences: iterable of str

    Returns:
        Whatever the last stage (``lemma``) returns: a list of token lists.
    """
    print("Starting Normalization Process")
    # The spell_correction stage is currently disabled:
    # sentences = spell_correction(sentences)
    stages = (simplify_punctuation_and_whitespace, normalize_contractions, lemma)
    for stage in stages:
        sentences = stage(sentences)
    print("Normalization Process Finished")
    return sentences

In [7]:
def split_to_sentences(data):
    """
    Split text into sentences on the '.' character.

    Args:
        data: str

    Returns:
        A list of lower-cased sentences with surrounding whitespace removed;
        pieces of three characters or fewer (after stripping) are dropped.
    """
    stripped_pieces = (piece.strip() for piece in data.split('.'))
    return [piece.lower() for piece in stripped_pieces if len(piece) > 3]

In [8]:
# For each type of data you have to configure the split_to_sentences function accordingly.
# result comes in seconds
# to_sentences = split_to_sentences(data)

In [9]:
# As you can see, even after splitting and checking the length, the data still has noise in it.
# For **twitter data** we would need all the functions in our pipeline.
# Still, the results below show that user names like rGXNogHMKM are not removed.

In [10]:
# final_data = normalization_pipeline(to_sentences)

In [11]:
# pkl_file = open('DataSet/Pickle/Twitter_mix.pickle','wb')
# pickle.dump(final_data, pkl_file)

In [12]:
# Load the cached news corpus (presumably the output of normalization_pipeline -- verify).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('DataSet/Pickle/News Mixed.pickle', 'rb') as f:
    data = pickle.load(f)
    
# Spot-check one entry.
data[2234]

['edu', 'eric', 'sieferman', 'subject', 're', 'some', 'thought']

In [13]:
# Load the cached books corpus.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('DataSet/Pickle/Books dataset.pickle', 'rb') as f:
    data_books = pickle.load(f)
    
# Spot-check one entry.
data_books[2234]

['this',
 'is',
 'reinforced',
 'by',
 'the',
 'later',
 'hospitalisation',
 'and',
 'feeding',
 'problem']

In [14]:
# Load the cached movie-review corpus.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('DataSet/Pickle/Moviereviews.pickle', 'rb') as f:
    data_movie = pickle.load(f)
    
# Spot-check one entry.
data_movie[2234]

['she',
 'is',
 'in',
 'an',
 'acting',
 'troupe',
 'to',
 'get',
 'away',
 'from',
 'her',
 'problem',
 'she',
 'witness',
 'love',
 'fear']

In [15]:
# Load the cached blog corpus.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('DataSet/Pickle/Blog Authorship Data.pickle', 'rb') as f:
    data_blog = pickle.load(f)
    
# Spot-check one entry.
data_blog[2234]

['hmm',
 'i',
 'like',
 'this',
 'book',
 'lol',
 'newas',
 'gota',
 'go',
 'watch',
 'the',
 'match',
 'sarah',
 'xxx',
 'i',
 'can',
 't',
 'wait',
 'for',
 'thing',
 'to',
 'get',
 'moving',
 'i',
 'want',
 'out',
 'of',
 'here']

In [16]:
# Load the cached Twitter corpus.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('DataSet/Pickle/Twitter New.pickle', 'rb') as f:
    data_twitter = pickle.load(f)
    
# Spot-check one entry.
data_twitter[2234]

['i', 'would', 'have', 'been', 'out', 'of', 'my', 'mind']

In [17]:
# 80/20 train/test split of the news corpus.
# `tokenized_data` is an alias of `data`, so the in-place shuffle also reorders `data`.
tokenized_data = data
random.seed(87)  # fixed seed before shuffling
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [18]:
# 80/20 train/test split of the books corpus.
# `tokenized_data1` is an alias of `data_books`, so the in-place shuffle also reorders it.
tokenized_data1 = data_books
random.seed(87)  # fixed seed before shuffling
random.shuffle(tokenized_data1)

train_size_books = int(len(tokenized_data1) * 0.8)
train_data_books = tokenized_data1[0:train_size_books]
test_data_books = tokenized_data1[train_size_books:]

In [19]:
# 80/20 train/test split of the movie-review corpus.
# `tokenized_data2` is an alias of `data_movie`, so the in-place shuffle also reorders it.
tokenized_data2 = data_movie
random.seed(87)  # fixed seed before shuffling
random.shuffle(tokenized_data2)

train_size_movie = int(len(tokenized_data2) * 0.8)
train_data_movie = tokenized_data2[0:train_size_movie]
test_data_movie = tokenized_data2[train_size_movie:]

In [20]:
# 80/20 train/test split of the blog corpus.
# `tokenized_data3` is an alias of `data_blog`, so the in-place shuffle also reorders it.
tokenized_data3 = data_blog
random.seed(87)  # fixed seed before shuffling
random.shuffle(tokenized_data3)

train_size_blog = int(len(tokenized_data3) * 0.8)
train_data_blog = tokenized_data3[0:train_size_blog]
test_data_blog = tokenized_data3[train_size_blog:]

In [21]:
# 80/20 train/test split of the Twitter corpus.
# `tokenized_data4` is an alias of `data_twitter`, so the in-place shuffle also reorders it.
tokenized_data4 = data_twitter
random.seed(87)  # fixed seed before shuffling
random.shuffle(tokenized_data4)

train_size_twitter = int(len(tokenized_data4) * 0.8)
train_data_twitter = tokenized_data4[0:train_size_twitter]
test_data_twitter = tokenized_data4[train_size_twitter:]

In [22]:
def count_words(tokenized_sentences):
    """
    Tally how often each token occurs across all tokenized sentences.

    Args:
        tokenized_sentences: List of lists of strings

    Returns:
        dict mapping each word (str) to its frequency (int), keyed in order
        of first appearance
    """
    frequencies = {}
    for tokens in tokenized_sentences:
        for word in tokens:
            frequencies[word] = frequencies.get(word, 0) + 1
    return frequencies


def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """
    Build the closed vocabulary: words occurring at least count_threshold times.

    Args:
        tokenized_sentences: List of lists of strings
        count_threshold: minimum number of occurrences for a word to be kept.

    Returns:
        List of words whose count (as reported by count_words) is
        count_threshold or more
    """
    frequencies = count_words(tokenized_sentences)
    return [word for word, occurrences in frequencies.items()
            if occurrences >= count_threshold]

In [23]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    """
    Substitute every out-of-vocabulary token with the unknown token.

    Args:
        tokenized_sentences: List of lists of strings
        vocabulary: Iterable of the words to keep
        unknown_token: String that stands in for out-of-vocabulary words

    Returns:
        New list of lists of strings with the same shape as the input, where
        each token not found in the vocabulary is replaced by unknown_token
    """
    known_words = set(vocabulary)  # set for constant-time membership tests
    return [
        [token if token in known_words else unknown_token for token in sentence]
        for sentence in tokenized_sentences
    ]


def preprocess_data(train_data, test_data, count_threshold):
    """
    Build the vocabulary from the training data and mask rare words.

    Args:
        train_data, test_data: List of lists of strings.
        count_threshold: Words that occur fewer times than this in the
                      training data are treated as unknown.

    Returns:
        Tuple of
        - training data with out-of-vocabulary words replaced by "<unk>"
        - test data with out-of-vocabulary words replaced by "<unk>"
        - vocabulary of words that appear count_threshold times or more
          in the training data
    """
    vocabulary = get_words_with_nplus_frequency(train_data, count_threshold)
    # The same training-derived vocabulary is applied to both splits.
    train_data_replaced, test_data_replaced = [
        replace_oov_words_by_unk(split, vocabulary)
        for split in (train_data, test_data)
    ]
    return train_data_replaced, test_data_replaced, vocabulary

In [24]:
# news data
# Words occurring fewer than `minimum_freq` times in the training split become "<unk>".
minimum_freq = 3
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, test_data, minimum_freq)

In [25]:
# books data
# Words occurring fewer than `minimum_freq` times in the training split become "<unk>".
minimum_freq = 3
train_data_processed_books, test_data_processed_books, vocabulary_books = preprocess_data(train_data_books, test_data_books,
                                                                                          minimum_freq)

In [26]:
# movie-review data
# Words occurring fewer than `minimum_freq` times in the training split become "<unk>".
minimum_freq = 3
train_data_processed_movie, test_data_processed_movie, vocabulary_movie = preprocess_data(train_data_movie, test_data_movie,
                                                                                          minimum_freq)

In [27]:
# blog data
# Words occurring fewer than `minimum_freq` times in the training split become "<unk>".
minimum_freq = 3
train_data_processed_blog, test_data_processed_blog, vocabulary_blog = preprocess_data(train_data_blog, test_data_blog,
                                                                                          minimum_freq)

In [28]:
# Twitter data
# Words occurring fewer than `minimum_freq` times in the training split become "<unk>".
minimum_freq = 3
train_data_processed_twitter, test_data_processed_twitter, vocabulary_twitter = preprocess_data(train_data_twitter, 
                                                                                                test_data_twitter, 
                                                                                                minimum_freq)

In [29]:
# Develop n-gram based language models
def count_n_grams(data, n, start_token='<s>', end_token='<e>'):
    """
    Count all n-grams in the data

    Each sentence is padded with ``n`` start tokens in front and one end token
    at the back before its n-grams are counted.

    Args:
        data: List of lists of words
        n: number of words in a sequence
        start_token: padding token prepended n times
        end_token: token appended once

    Returns:
        A dictionary that maps a tuple of n words to its frequency
    """
    n_grams = {}

    for sentence in data:
        padded = tuple([start_token] * n + list(sentence) + [end_token])

        # Stop at the last full window. The original bound (len - 1 for n > 1)
        # also emitted truncated tuples shorter than n at the end of each
        # sentence whenever n >= 3.
        for i in range(len(padded) - n + 1):
            n_gram = padded[i:i + n]
            n_grams[n_gram] = n_grams.get(n_gram, 0) + 1

    return n_grams

In [30]:
def estimate_probability(word, previous_n_gram,
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Estimate the probability of a next word using the n-gram counts with k-smoothing

        P(word | previous_n_gram) = (count(previous_n_gram + word) + k)
                                    / (count(previous_n_gram) + k * vocabulary_size)

    Args:
        word: next word
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary_size: number of words in the vocabulary
        k: positive constant, smoothing parameter

    Returns:
        A probability
    """
    # Tuples are required because the count dictionaries are keyed by tuple.
    previous_n_gram = tuple(previous_n_gram)

    # Unseen contexts count as zero; smoothing keeps the denominator positive.
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
    denominator = previous_n_gram_count + k * vocabulary_size

    # Unseen (context, word) pairs count as zero as well.
    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)
    numerator = n_plus1_gram_count + k

    return numerator / denominator


# Estimate probabilities for all words
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    """
    Score every candidate next word with k-smoothed n-gram probabilities

    Args:
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary: List of words
        k: positive constant, smoothing parameter

    Returns:
        A dictionary mapping each candidate next word to its probability.
    """
    # Dictionary keys are tuples, so normalize the context once up front.
    context = tuple(previous_n_gram)

    # "<e>" and "<unk>" are valid next words; "<s>" never follows anything.
    candidates = vocabulary + ["<e>", "<unk>"]
    candidate_count = len(candidates)

    return {
        candidate: estimate_probability(candidate, context,
                                        n_gram_counts, n_plus1_gram_counts,
                                        candidate_count, k=k)
        for candidate in candidates
    }

In [31]:
# Perplexity - It is used as a metric to see how accurate the model is.
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Calculate the perplexity of one sentence

    Args:
        sentence: List of strings
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary_size: number of unique words in the vocabulary
        k: Positive smoothing constant

    Returns:
        Perplexity score

    Raises:
        ValueError: if n_gram_counts is empty (n cannot be inferred).
    """
    import math

    if not n_gram_counts:
        raise ValueError("n_gram_counts must not be empty")
    # n is the key length; peek at one key instead of copying all of them.
    n = len(next(iter(n_gram_counts)))

    # prepend <s> and append <e>
    sentence = tuple(["<s>"] * n + list(sentence) + ["<e>"])
    N = len(sentence)

    # Sum log(1/P) instead of multiplying 1/P: the running product overflows
    # to inf on long sentences.
    log_inverse_sum = 0.0

    for t in range(n, N):
        n_gram = sentence[t - n:t]
        word = sentence[t]

        # Use the caller's vocabulary_size and k. The original read the
        # notebook global `unique_words` and hard-coded k=1 here.
        probability = estimate_probability(word, n_gram, n_gram_counts, n_plus1_gram_counts,
                                           vocabulary_size, k=k)
        log_inverse_sum -= math.log(probability)

    # Nth root of the product of 1/P, computed in log space.
    return math.exp(log_inverse_sum / float(N))

In [32]:
# Build an auto-complete system
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, start_with, k=1.0):
    """
    Get suggestion for the next word

    Args:
        previous_tokens: The sentence you input where each token is a word.
            Only the last n tokens are used; with fewer than n tokens the
            context is shorter than the keys of n_gram_counts.
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary: List of words
        start_with: If not None, specifies the first few letters of the next word
        k: positive constant, smoothing parameter

    Returns:
        A tuple of
          - string of the most likely next word (None when no word qualifies)
          - corresponding probability (0 when no word qualifies)

    Raises:
        ValueError: if n_gram_counts is empty (n cannot be inferred).
    """
    if not n_gram_counts:
        raise ValueError("n_gram_counts must not be empty")
    # n is the key length. Peeking at one key avoids copying every key of a
    # potentially very large dictionary on each call.
    n = len(next(iter(n_gram_counts)))

    # The most recent n words form the previous n-gram.
    previous_n_gram = previous_tokens[-n:]

    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)

    suggestion = None
    max_prob = 0
    for word, prob in probabilities.items():
        # Skip words that do not match the optional prefix.
        if start_with is not None and not word.startswith(start_with):
            continue
        if prob > max_prob:
            suggestion, max_prob = word, prob

    return suggestion, max_prob


# Get multiple suggestions
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, start_with, k=1.0):
    """
    Collect one suggestion per pair of consecutive count tables.

    Entry i of n_gram_counts_list is used as the n-gram table and entry i + 1
    as the (n+1)-gram table, so a list of m tables yields m - 1 suggestions.

    Returns:
        List of the (word, probability) tuples returned by suggest_a_word.
    """
    consecutive_tables = zip(n_gram_counts_list, n_gram_counts_list[1:])
    return [
        suggest_a_word(previous_tokens, lower_order,
                       higher_order, vocabulary,
                       k=k, start_with=start_with)
        for lower_order, higher_order in consecutive_tables
    ]

In [33]:
# Model inputs for the news corpus: processed training sentences and their vocabulary.
sentences = train_data_processed #+ test_data_processed
unique_words = vocabulary

In [34]:
# Model inputs for the books corpus: processed training sentences and their vocabulary.
sentences_books = train_data_processed_books
unique_words_books = vocabulary_books

In [35]:
# Model inputs for the movie-review corpus: processed training sentences and their vocabulary.
sentences_movie = train_data_processed_movie
unique_words_movie = vocabulary_movie

In [36]:
# Model inputs for the blog corpus: processed training sentences and their vocabulary.
sentences_blog = train_data_processed_blog
unique_words_blog = vocabulary_blog

In [37]:
# Model inputs for the Twitter corpus: processed training sentences and their vocabulary.
sentences_twitter = train_data_processed_twitter
unique_words_twitter = vocabulary_twitter

In [38]:
# Size of the news vocabulary (before "<e>" and "<unk>" are added for scoring).
len(unique_words)

73946

In [39]:
# news dataset
# Count tables for n = 1..4; consecutive tables are paired as (n, n+1) models by get_suggestions.
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
trigram_counts = count_n_grams(sentences, 3)
quadgram_counts = count_n_grams(sentences, 4)
# qintgram_counts = count_n_grams(sentences, 5)

In [40]:
# books dataset: count tables for n = 1..4
unigram_counts_books = count_n_grams(sentences_books, 1)
bigram_counts_books = count_n_grams(sentences_books, 2)
trigram_counts_books = count_n_grams(sentences_books, 3)
quadgram_counts_books = count_n_grams(sentences_books, 4)

In [41]:
# movie-review dataset: count tables for n = 1..4
unigram_counts_movie = count_n_grams(sentences_movie, 1)
bigram_counts_movie = count_n_grams(sentences_movie, 2)
trigram_counts_movie = count_n_grams(sentences_movie, 3)
quadgram_counts_movie = count_n_grams(sentences_movie, 4)

In [42]:
# blog dataset: count tables for n = 1..4
unigram_counts_blog = count_n_grams(sentences_blog, 1)
bigram_counts_blog = count_n_grams(sentences_blog, 2)
trigram_counts_blog = count_n_grams(sentences_blog, 3)
quadgram_counts_blog = count_n_grams(sentences_blog, 4)

In [43]:
# Twitter dataset: count tables for n = 1..4
unigram_counts_twitter = count_n_grams(sentences_twitter, 1)
bigram_counts_twitter = count_n_grams(sentences_twitter, 2)
trigram_counts_twitter = count_n_grams(sentences_twitter, 3)
quadgram_counts_twitter = count_n_grams(sentences_twitter, 4)

In [44]:
# Diagnostic: show the interpreter's current recursion limit.
import sys
print(sys.getrecursionlimit())

3000


In [45]:
# Demo on the news model: three count tables give two (n, n+1) pairs,
# so get_suggestions returns two (word, probability) tuples.
n_gram_counts_list = [bigram_counts, trigram_counts, quadgram_counts]
previous_tokens = ['try', 'to', 'pick', 'floor', 'and', 'wall', 'color', 'that', 'are'] # warm
start_withs = None  # no prefix constraint on the suggested word
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list, unique_words, start_withs, k=1.0)

print(tmp_suggest4)

[('not', 0.0011250827266710787), ('metallic', 4.0566854175681525e-05)]


In [46]:
# Count whitespace-separated words in the raw news text file.
# The context manager closes the file; the original handle was never closed.
# `f` still ends up bound to the file's text, as before.
with open("DataSet/News Mixed.txt", "r", encoding="UTF-8") as news_file:
    f = news_file.read()
words = f.split()
print(len(words))

15799908


In [47]:
# Train: News n-gram counts - [unigram_counts, etc]
# Test:  News - test_data_processed
# For the first 100 test sentences, predict the last token from the preceding ones.
# `count` is incremented once for every model (unigram->bigram, bigram->trigram,
# trigram->quadgram) whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts, bigram_counts, trigram_counts, quadgram_counts]
start_withs = None
count = 0

for i in tqdm(range(0, 100)):
    testing = test_data_processed[i][0:len(test_data_processed[i])-1]
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words, start_withs, k=1.0)
    for j in tmp_suggest:
        if test_data_processed[i][len(test_data_processed[i])-1] == j[0]:
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


44


In [48]:
# Scratch demo of random.shuffle: it reorders the list in place.
# Note that it also draws from the same global `random` state used by the other cells.
import random
l = [1,2,3,4,5]
random.shuffle(l)
print(l)

[4, 3, 2, 5, 1]


In [49]:
# Train: Books n-gram counts - [unigram_counts_books, etc]
# Test:  Movie reviews - test_data_processed_movie
# For 100 test sentences, predict the last token from the preceding ones; `count` is
# incremented once for every model whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts_books, bigram_counts_books, trigram_counts_books, quadgram_counts_books]
start_withs = None
count = 0
random.shuffle(test_data_processed_movie) # shuffles the shared list in place

for i in tqdm(range(0, 100)):
    testing = test_data_processed_movie[i][0:len(test_data_processed_movie[i])-1] # all tokens but the last
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words_books, start_withs, k=1.0) # one suggestion per model
    for j in tmp_suggest:
        if test_data_processed_movie[i][len(test_data_processed_movie[i])-1] == j[0]: # suggestion matches the true last token
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


13


In [50]:
# Train: Books n-gram counts - [unigram_counts_books, etc]
# Test:  Twitter - test_data_processed_twitter
# For 100 test sentences, predict the last token from the preceding ones; `count` is
# incremented once for every model whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts_books, bigram_counts_books, trigram_counts_books, quadgram_counts_books]
start_withs = None
count = 0
random.shuffle(test_data_processed_twitter) # shuffles the shared list in place

for i in tqdm(range(0, 100)):
    testing = test_data_processed_twitter[i][0:len(test_data_processed_twitter[i])-1] # all tokens but the last
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words_books, start_withs, k=1.0) # one suggestion per model
    for j in tmp_suggest:
        if test_data_processed_twitter[i][len(test_data_processed_twitter[i])-1] == j[0]: # suggestion matches the true last token
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


6


In [51]:
# Train: Books n-gram counts - [unigram_counts_books, etc]
# Test:  News - test_data_processed
# For 100 test sentences, predict the last token from the preceding ones; `count` is
# incremented once for every model whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts_books, bigram_counts_books, trigram_counts_books, quadgram_counts_books]
start_withs = None
count = 0
random.shuffle(test_data_processed) # shuffles the shared list in place

for i in tqdm(range(0, 100)):
    testing = test_data_processed[i][0:len(test_data_processed[i])-1] # all tokens but the last
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words_books, start_withs, k=1.0) # one suggestion per model
    for j in tmp_suggest:
        if test_data_processed[i][len(test_data_processed[i])-1] == j[0]: # suggestion matches the true last token
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


12


In [52]:
# Train: Books n-gram counts - [unigram_counts_books, etc]
# Test:  Blog - test_data_processed_blog
# For 100 test sentences, predict the last token from the preceding ones; `count` is
# incremented once for every model whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts_books, bigram_counts_books, trigram_counts_books, quadgram_counts_books]
start_withs = None
count = 0
random.shuffle(test_data_processed_blog) # shuffles the shared list in place

for i in tqdm(range(0, 100)):
    testing = test_data_processed_blog[i][0:len(test_data_processed_blog[i])-1] # all tokens but the last
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words_books, start_withs, k=1.0) # one suggestion per model
    for j in tmp_suggest:
        if test_data_processed_blog[i][len(test_data_processed_blog[i])-1] == j[0]: # suggestion matches the true last token
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


13


In [53]:
# Train: Movie-review n-gram counts - [unigram_counts_movie, etc]
# Test:  Books - test_data_processed_books
# For 100 test sentences, predict the last token from the preceding ones; `count` is
# incremented once for every model whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts_movie, bigram_counts_movie, trigram_counts_movie, quadgram_counts_movie]
start_withs = None
count = 0
random.shuffle(test_data_processed_books) # shuffles the shared list in place

for i in tqdm(range(0, 100)):
    testing = test_data_processed_books[i][0:len(test_data_processed_books[i])-1] # all tokens but the last
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words_movie, start_withs, k=1.0) # one suggestion per model
    for j in tmp_suggest:
        if test_data_processed_books[i][len(test_data_processed_books[i])-1] == j[0]: # suggestion matches the true last token
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


15


In [54]:
# Train: Movie-review n-gram counts - [unigram_counts_movie, etc]
# Test:  Twitter - test_data_processed_twitter
# For 100 test sentences, predict the last token from the preceding ones; `count` is
# incremented once for every model whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts_movie, bigram_counts_movie, trigram_counts_movie, quadgram_counts_movie]
start_withs = None
count = 0
random.shuffle(test_data_processed_twitter) # shuffles the shared list in place

for i in tqdm(range(0, 100)):
    testing = test_data_processed_twitter[i][0:len(test_data_processed_twitter[i])-1] # all tokens but the last
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words_movie, start_withs, k=1.0) # one suggestion per model
    for j in tmp_suggest:
        if test_data_processed_twitter[i][len(test_data_processed_twitter[i])-1] == j[0]: # suggestion matches the true last token
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


21


In [55]:
# Train: Movie-review n-gram counts - [unigram_counts_movie, etc]
# Test:  News - test_data_processed
# Unlike most other evaluation cells, this one scores sentences 1100..1199 of the shuffled list.
# `count` is incremented once for every model whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts_movie, bigram_counts_movie, trigram_counts_movie, quadgram_counts_movie]
start_withs = None
count = 0
random.shuffle(test_data_processed) # shuffles the shared list in place

for i in tqdm(range(1100, 1200)):
    testing = test_data_processed[i][0:len(test_data_processed[i])-1] # all tokens but the last
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words_movie, start_withs, k=1.0) # one suggestion per model
    for j in tmp_suggest:
        if test_data_processed[i][len(test_data_processed[i])-1] == j[0]: # suggestion matches the true last token
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


5


In [57]:
# Train: Movie-review n-gram counts - [unigram_counts_movie, etc]
# Test:  Blog - test_data_processed_blog
# Unlike most other evaluation cells, this one scores sentences 100..199 of the shuffled list.
# `count` is incremented once for every model whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts_movie, bigram_counts_movie, trigram_counts_movie, quadgram_counts_movie]
start_withs = None
count = 0
random.shuffle(test_data_processed_blog) # shuffles the shared list in place

for i in tqdm(range(100, 200)):
    testing = test_data_processed_blog[i][0:len(test_data_processed_blog[i])-1] # all tokens but the last
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words_movie, start_withs, k=1.0) # one suggestion per model
    for j in tmp_suggest:
        if test_data_processed_blog[i][len(test_data_processed_blog[i])-1] == j[0]: # suggestion matches the true last token
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


6


In [58]:
# Train: Twitter n-gram counts - [unigram_counts_twitter, etc]
# Test:  Books - test_data_processed_books
# For 100 test sentences, predict the last token from the preceding ones; `count` is
# incremented once for every model whose top suggestion equals the true last token.
from tqdm.notebook import tqdm

n_gram_counts_list = [unigram_counts_twitter, bigram_counts_twitter, trigram_counts_twitter, quadgram_counts_twitter]
start_withs = None
count = 0
random.shuffle(test_data_processed_books) # shuffles the shared list in place

for i in tqdm(range(0, 100)):
    testing = test_data_processed_books[i][0:len(test_data_processed_books[i])-1] # all tokens but the last
    # print(testing)
    tmp_suggest = get_suggestions(testing, n_gram_counts_list, unique_words_twitter, start_withs, k=1.0) # one suggestion per model
    for j in tmp_suggest:
        if test_data_processed_books[i][len(test_data_processed_books[i])-1] == j[0]: # suggestion matches the true last token
            count = count + 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


8


In [59]:
# Cross-domain evaluation.
#   model : n-gram counts and vocabulary carrying the `_twitter` suffix
#   data  : test_data_processed_movie, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts_twitter,
    bigram_counts_twitter,
    trigram_counts_twitter,
    quadgram_counts_twitter,
]
start_withs = None
random.shuffle(test_data_processed_movie)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_movie[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_twitter, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_movie[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


10


In [60]:
# Cross-domain evaluation.
#   model : n-gram counts and vocabulary carrying the `_twitter` suffix
#   data  : test_data_processed, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts_twitter,
    bigram_counts_twitter,
    trigram_counts_twitter,
    quadgram_counts_twitter,
]
start_withs = None
random.shuffle(test_data_processed)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_twitter, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


6


In [61]:
# Cross-domain evaluation.
#   model : n-gram counts and vocabulary carrying the `_twitter` suffix
#   data  : test_data_processed_blog, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts_twitter,
    bigram_counts_twitter,
    trigram_counts_twitter,
    quadgram_counts_twitter,
]
start_withs = None
random.shuffle(test_data_processed_blog)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_blog[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_twitter, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_blog[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


19


In [62]:
# Cross-domain evaluation.
#   model : the unsuffixed n-gram counts (unigram_counts .. quadgram_counts)
#           together with the unsuffixed vocabulary `unique_words`
#   data  : test_data_processed_books, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts,
    bigram_counts,
    trigram_counts,
    quadgram_counts,
]
start_withs = None
random.shuffle(test_data_processed_books)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_books[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_books[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


17


In [63]:
# Cross-domain evaluation.
#   model : the unsuffixed n-gram counts (unigram_counts .. quadgram_counts)
#           together with the unsuffixed vocabulary `unique_words`
#   data  : test_data_processed_movie, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts,
    bigram_counts,
    trigram_counts,
    quadgram_counts,
]
start_withs = None
random.shuffle(test_data_processed_movie)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_movie[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_movie[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


14


In [64]:
# Cross-domain evaluation.
#   model : the unsuffixed n-gram counts (unigram_counts .. quadgram_counts)
#           together with the unsuffixed vocabulary `unique_words`
#   data  : test_data_processed_twitter, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts,
    bigram_counts,
    trigram_counts,
    quadgram_counts,
]
start_withs = None
random.shuffle(test_data_processed_twitter)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_twitter[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_twitter[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


18


In [65]:
# Cross-domain evaluation.
#   model : the unsuffixed n-gram counts (unigram_counts .. quadgram_counts)
#           together with the unsuffixed vocabulary `unique_words`
#   data  : test_data_processed_blog, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts,
    bigram_counts,
    trigram_counts,
    quadgram_counts,
]
start_withs = None
random.shuffle(test_data_processed_blog)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_blog[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_blog[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


6


In [66]:
# Cross-domain evaluation.
#   model : n-gram counts and vocabulary carrying the `_blog` suffix
#   data  : test_data_processed_books, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts_blog,
    bigram_counts_blog,
    trigram_counts_blog,
    quadgram_counts_blog,
]
start_withs = None
random.shuffle(test_data_processed_books)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_books[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_blog, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_books[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


17


In [67]:
# Cross-domain evaluation.
#   model : n-gram counts and vocabulary carrying the `_blog` suffix
#   data  : test_data_processed_movie, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts_blog,
    bigram_counts_blog,
    trigram_counts_blog,
    quadgram_counts_blog,
]
start_withs = None
random.shuffle(test_data_processed_movie)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_movie[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_blog, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_movie[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


12


In [68]:
# Cross-domain evaluation.
#   model : n-gram counts and vocabulary carrying the `_blog` suffix
#   data  : test_data_processed_twitter, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts_blog,
    bigram_counts_blog,
    trigram_counts_blog,
    quadgram_counts_blog,
]
start_withs = None
random.shuffle(test_data_processed_twitter)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_twitter[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_blog, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_twitter[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


12


In [69]:
# Cross-domain evaluation.
#   model : n-gram counts and vocabulary carrying the `_blog` suffix
#   data  : test_data_processed, first 100 entries after an in-place shuffle
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
from tqdm.notebook import tqdm

n_gram_counts_list = [
    unigram_counts_blog,
    bigram_counts_blog,
    trigram_counts_blog,
    quadgram_counts_blog,
]
start_withs = None
random.shuffle(test_data_processed)  # mutates the list; no seed is set in this cell
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_blog, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


7


In [70]:
# Matching-suffix evaluation: `_books` n-gram counts and vocabulary scored on
# test_data_processed_books. This cell does not shuffle, so the first 100
# entries are taken in whatever order the list currently has.
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
n_gram_counts_list = [
    unigram_counts_books,
    bigram_counts_books,
    trigram_counts_books,
    quadgram_counts_books,
]
start_withs = None
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_books[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_books, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_books[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


29


In [71]:
# Matching-suffix evaluation: `_movie` n-gram counts and vocabulary scored on
# test_data_processed_movie. This cell does not shuffle, so the first 100
# entries are taken in whatever order the list currently has.
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
n_gram_counts_list = [
    unigram_counts_movie,
    bigram_counts_movie,
    trigram_counts_movie,
    quadgram_counts_movie,
]
start_withs = None
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_movie[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_movie, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_movie[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


38


In [72]:
# Matching-suffix evaluation: `_twitter` n-gram counts and vocabulary scored on
# test_data_processed_twitter. This cell does not shuffle, so the first 100
# entries are taken in whatever order the list currently has.
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
n_gram_counts_list = [
    unigram_counts_twitter,
    bigram_counts_twitter,
    trigram_counts_twitter,
    quadgram_counts_twitter,
]
start_withs = None
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_twitter[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_twitter, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_twitter[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


28


In [73]:
# Matching-suffix evaluation: `_blog` n-gram counts and vocabulary scored on
# test_data_processed_blog. This cell does not shuffle, so the first 100
# entries are taken in whatever order the list currently has.
# For every entry the last element is held out, the preceding elements are
# passed to get_suggestions, and each returned suggestion whose first item
# equals the held-out element adds one to `count`.
n_gram_counts_list = [
    unigram_counts_blog,
    bigram_counts_blog,
    trigram_counts_blog,
    quadgram_counts_blog,
]
start_withs = None
count = 0

for i in tqdm(range(100)):
    # Everything except the final element is the context.
    testing = test_data_processed_blog[i][:-1]
    tmp_suggest = get_suggestions(
        testing, n_gram_counts_list, unique_words_blog, start_withs, k=1.0
    )
    for j in tmp_suggest:
        # One hit per matching suggestion (several suggestions may match).
        if test_data_processed_blog[i][-1] == j[0]:
            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


18
