In [93]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline


#import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf.reset_default_graph()
import tensorflow_hub as hub
import tf_sentencepiece

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#interactive stuff
import io
from IPython.display import clear_output
from ipywidgets import interactive, FileUpload, Output

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\work\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\work\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [None]:
# Smoke test: run a tiny matmul pinned to '/gpu:0' to see whether a GPU is usable.
try:
    tf.compat.v1.disable_eager_execution()

    with tf.device('/gpu:0'):
        a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
        b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
        c = tf.matmul(a, b)

    with tf.Session() as sess:
        print(sess.run(c))
    # Expected output:
    #   [[22. 28.]
    #   [49. 64.]]
except Exception as err:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report what
    # actually failed -- it is not necessarily a missing GPU.
    print("no GPU, ah :(")
    print("reason: {}".format(type(err).__name__))

In [None]:
# Universal Sentence Encoder (USE): build the embedding graph once and keep a
# module-level session open so generate_embeddings() can reuse it.
use_module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/1"

# Use a dedicated graph rather than the default one.
g = tf.Graph()
with g.as_default():
    # 1-D batch of strings; the batch size is left open (None).
    text_input = tf.placeholder(dtype=tf.string, shape=[None])
    embed_module = hub.Module(use_module_url)
    embedded_text = embed_module(text_input)
    # One op that runs both the global-variable and the table initializers.
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
# Freeze the graph: no further ops may be added after this call.
g.finalize()

# Single-threaded op scheduling (both inter-op and intra-op pools set to 1).
config = tf.ConfigProto(inter_op_parallelism_threads=1,
                   intra_op_parallelism_threads=1)
# Allocate GPU memory on demand instead of reserving it all up front.
config.gpu_options.allow_growth = True

# The session is bound to graph `g`; it is not closed anywhere in this cell.
session = tf.Session(graph=g, config=config)

# Initialize variables and lookup tables before the first embedding request.
session.run(init_op)



def generate_embeddings(messages_in):
    """Embed a batch of sentences with the Universal Sentence Encoder.

    The sentences are fed to the module-level ``text_input`` placeholder and
    ``embedded_text`` is evaluated in the module-level ``session``.

    Parameters
    ----------
    messages_in
        Batch of strings to embed (the placeholder is declared as a 1-D
        string tensor).

    Returns
    -------
    The value ``session.run`` produces for ``embedded_text`` -- presumably one
    embedding vector per input sentence (TODO confirm shape against the module).
    """
    feed = {text_input: messages_in}
    embeddings = session.run(embedded_text, feed_dict=feed)
    return embeddings

In [122]:
def clear_data(df):
    """Deduplicate the frame on ``title`` and reduce every title to its nouns.

    Steps, in order:
      1. drop rows with a duplicated ``title`` and rows whose ``title`` is NA;
      2. reset the index and store it in a new ``index`` column;
      3. strip a fixed list of noise substrings from each title;
      4. keep only the tokens whose NLTK POS tag starts with ``NN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column. The frame is modified **in place**.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame that was passed in.
    """
    import re

    # Remove duplicates and missing titles.
    df.drop_duplicates(subset=['title'], inplace=True)
    # `how` and `thresh` are mutually exclusive in pandas >= 2.0 (passing both,
    # even thresh=None, raises TypeError), so only `how` is given.
    df.dropna(axis=0, how='any', subset=['title'], inplace=True)

    df.reset_index(inplace=True, drop=True)
    df['index'] = df.index

    # Substrings that carry no meaning for the comparison.
    noise_words = ['Google Search', '|', '%', '.', ' — ', '/']
    noise_regex = re.compile('|'.join(map(re.escape, noise_words)))

    def remove_non_nouns(line):
        """Return the noun tokens of ``line`` joined by single spaces."""
        tagged = nltk.pos_tag(nltk.word_tokenize(line))
        return ' '.join(word for word, pos in tagged if pos[:2] == 'NN')

    def clean_title(title):
        """Strip noise substrings, then keep only the nouns."""
        return remove_non_nouns(noise_regex.sub("", title))

    # Cast to str first: a non-string title (e.g. a purely numeric one) would
    # otherwise make the regex substitution raise TypeError.
    df['title'] = df['title'].astype(str).apply(clean_title)

    return df

In [123]:
def append_most_common_sentence(df, compare_to_top=10):
    """Append a synthetic row made of the most frequent title words.

    All titles are joined, tokenized with NLTK, and the ``compare_to_top``
    most frequent tokens are joined with spaces into one sentence. That
    sentence is appended as a new last row with ``type`` set to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``title`` column. It is not modified.
    compare_to_top : int, optional
        Number of most frequent tokens used to build the sentence (default 10).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n index and the extra row at the end;
        columns the new row does not set are filled with NaN.
    """
    # Word frequency over all titles.
    tokens = nltk.tokenize.word_tokenize(' '.join(df["title"]))
    freq = nltk.FreqDist(tokens)

    top_n_words_sentence = ' '.join(word for word, _count in freq.most_common(compare_to_top))

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    reference_row = pd.DataFrame([{'title': top_n_words_sentence, 'type': 1}])
    return pd.concat([df, reference_row], ignore_index=True)

In [124]:
def sort_by_narest_neighbor(df, use_embeddings, n_trees=10):
    """Order the rows of ``df`` by embedding distance to its last row.

    An Annoy index (euclidean metric) is built over ``use_embeddings``; the
    neighbours of the last row are queried and the result is reversed, so the
    furthest rows come first and the reference row itself comes last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose i-th row corresponds to ``use_embeddings[i]``.
    use_embeddings : sequence of vectors
        One embedding per row of ``df``; all vectors must share one length.
    n_trees : int, optional
        Number of trees passed to ``AnnoyIndex.build`` (default 10).

    Returns
    -------
    pandas.DataFrame
        A reordered copy of ``df`` that keeps the original index labels.
    """
    from annoy import AnnoyIndex

    n_rows = len(df)
    if n_rows == 0:
        # Nothing to compare against; avoid querying an empty index.
        return df.copy()

    # Take the vector size from the data instead of hard-coding 512.
    dim = len(use_embeddings[0])
    nn_tree = AnnoyIndex(dim, 'euclidean')
    for position, vector in enumerate(use_embeddings):
        nn_tree.add_item(position, vector)
    nn_tree.build(n_trees)

    # Annoy item ids are the positions used above, so query and select by
    # position rather than by index label (labels need not be 0..n-1).
    positions = nn_tree.get_nns_by_item(n_rows - 1, n_rows)

    positions.reverse()  # furthest neighbours first

    return df.iloc[positions].copy()

In [146]:
# Two output areas: a short one hosting the upload button and a taller,
# scrollable one that receives the results.
outFileUpload = Output(layout=dict(border='1px solid black', height='70px'))
out = Output(layout=dict(border='1px solid black', height='400px', overflow='scroll'))

# Single-file upload restricted to .csv / .txt.
uploader = FileUpload(accept='.csv,.txt', multiple=False)

with outFileUpload:
    # Lift the pandas row-display limit so full frames are shown.
    pd.set_option("display.max_rows", None)
    display(uploader)

@out.capture()
def on_upload_change(change):
    """Handle a file upload: clean the titles, embed them and show the ranking.

    The uploaded CSV is read into a frame, cleaned with ``clear_data``, and a
    reference sentence built from the most common words is appended. All
    titles are embedded, the rows are ordered by distance to the reference
    sentence (furthest first), and the result is displayed in ``out``.

    Parameters
    ----------
    change : dict
        Change notification from the uploader; ``change['owner'].data[0]``
        is read as the raw bytes of the uploaded file.
        NOTE(review): ``.data`` looks like the ipywidgets 7.x FileUpload API --
        confirm against the installed ipywidgets version.
    """
    df = pd.read_csv(io.BytesIO(change['owner'].data[0]))

    out.clear_output(wait=True)

    if 'title' not in df:
        return display("error: csv file needs to have a 'title' column")

    df = clear_data(df)
    df = append_most_common_sentence(df)

    # The appended reference sentence is the last row.
    compSentence = df.tail(1)

    use_embeddings = generate_embeddings(df['title'].tolist())
    df = sort_by_narest_neighbor(df, use_embeddings)

    # The reference sentence is only a comparison target, not a result row.
    df = df.drop(compSentence.index)

    display("comparing sentences to '" + compSentence['title'].values[0] + "''")

    # Only 'title' is validated above, so show 'date' / 'time' when the file
    # has them instead of raising KeyError when it does not.
    shown_columns = [col for col in ('date', 'time', 'title') if col in df]
    display(df[shown_columns])
        
# Run the handler whenever the uploader's `_counter` trait changes.
uploader.observe(on_upload_change, names='_counter')

# Render the upload area followed by the results area.
display(outFileUpload, out)

Output(layout=Layout(border='1px solid black', height='70px'))

Output(layout=Layout(border='1px solid black', height='400px', overflow='scroll'))