### How to use

1. Install the "Export Chrome History" extension:
https://chrome.google.com/webstore/detail/export-chrome-history/dihloblpkeiddiaojbagoecedbfpifdj?hl=en

2. Export one week's worth of history as a CSV file:

![alt text](print.png "Title")

3. Press Ctrl-F9 to run all cells of this notebook (this will take a while).

4. Upload the csv file:

![alt text](print2.png "Title")

In [None]:
%%capture
!pip3 install annoy

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline


import tensorflow as tf
import tensorflow_hub as hub

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#interactive stuff
import io
from IPython.display import clear_output
from ipywidgets import interactive, FileUpload, Output

from annoy import AnnoyIndex

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\work\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\work\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Load the Universal Sentence Encoder (version 4) module from TensorFlow Hub.
# The resulting callable is used by generate_embeddings() below.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def generate_embeddings(messages_in):
    """Embed the given messages with the module-level encoder.

    Args:
        messages_in: The collection of sentences handed to ``embed``.

    Returns:
        The encoder output converted with ``.numpy()``.
    """
    encoded = embed(messages_in)
    return encoded.numpy()

In [None]:
def clear_data(df):
    """Prepare the browsing-history frame for embedding.

    Copies ``title`` into a new ``text`` column, strips noise tokens from it,
    keeps only the nouns of every title and finally drops rows whose text is
    missing, empty or duplicated. The frame is modified in place and is also
    returned for convenience.

    Args:
        df: DataFrame that contains a ``title`` column.

    Returns:
        The same DataFrame with the added ``text`` and ``index`` columns and
        a fresh ``0..n-1`` index.
    """
    import re

    df['text'] = df['title']

    # Only `subset` is passed: `how` and `thresh` are mutually exclusive and
    # newer pandas releases raise a TypeError when both are supplied.
    df.dropna(subset=['text'], inplace=True)
    df.reset_index(inplace=True, drop=True)

    # Titles that were parsed as numbers would break the regex substitution.
    df['text'] = df['text'].astype(str)

    # Strip tokens that carry no meaning for the comparison.
    noise_words = ['Google Search', '|', '%', '.', ' — ', '/']
    noise_pattern = re.compile('|'.join(map(re.escape, noise_words)))
    df['text'] = df['text'].apply(lambda title: noise_pattern.sub("", title))

    def remove_non_nouns(line):
        # Penn Treebank noun tags all start with "NN" (NN, NNS, NNP, NNPS).
        tagged = nltk.pos_tag(nltk.word_tokenize(line))
        return ' '.join(word for word, pos in tagged if pos.startswith('NN'))

    df['text'] = df['text'].apply(remove_non_nouns)

    # Remove duplicates and rows left without any text. Noun filtering yields
    # an empty string (never NaN) when a title has no nouns, so those rows
    # have to be dropped explicitly.
    df.drop_duplicates(subset=['text'], inplace=True)
    df.drop(df.index[df['text'] == ''], inplace=True)
    df.dropna(subset=['text'], inplace=True)

    df.reset_index(inplace=True, drop=True)
    df['index'] = df.index

    return df

In [123]:
def append_most_common_sentence(df, compare_to_top=10):
    """Append a synthetic row built from the most frequent words in ``text``.

    All ``text`` values are joined, tokenized and counted; the
    ``compare_to_top`` most common tokens are joined with spaces into one
    sentence, which is appended as a new last row with ``type`` set to 1.

    Args:
        df: DataFrame with a string ``text`` column.
        compare_to_top: Number of most frequent tokens used for the sentence.

    Returns:
        A new DataFrame (re-indexed from 0) with the extra row at the end.
    """
    # Get the word frequency over the whole corpus.
    tokens = nltk.tokenize.word_tokenize(' '.join(df["text"]))
    freq = nltk.FreqDist(tokens)

    top_n_words_sentence = ' '.join(
        word for word, _count in freq.most_common(compare_to_top)
    )

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    summary_row = pd.DataFrame([{'text': top_n_words_sentence, 'type': 1}])
    return pd.concat([df, summary_row], ignore_index=True)

In [124]:
def sort_by_narest_neighbor(df, use_embeddings):
    """Reorder ``df`` by distance to its last row, furthest rows first.

    An Annoy index (512 dimensions, euclidean metric, 10 trees) is built from
    ``use_embeddings``, with each vector stored under its position. The
    neighbours of the item at ``df.last_valid_index()`` are then queried and
    the resulting order is reversed before re-indexing the frame.

    Args:
        df: DataFrame whose rows correspond to the embeddings.
        use_embeddings: Iterable of embedding vectors, one per row.

    Returns:
        ``df`` re-indexed with the reversed neighbour order.
    """
    index = AnnoyIndex(512, 'euclidean')
    for position, vector in enumerate(use_embeddings):
        index.add_item(position, vector)
    index.build(10)

    query_item = df.last_valid_index()
    nearest_first = index.get_nns_by_item(query_item, len(df))

    # Flip the ranking so the furthest neighbours come first.
    furthest_first = nearest_first[::-1]

    return df.reindex(furthest_first)

In [146]:
# Output area that hosts the upload button.
outFileUpload = Output(layout={'border': '1px solid black', 'height': '70px'})
# Scrollable output area; on_upload_change is captured into it.
out = Output(layout={'border': '1px solid black', 'height': '400px', 'overflow': 'scroll'})

# Single-file upload widget restricted to .csv / .txt files.
uploader = FileUpload(
    accept='.csv,.txt',
    multiple=False
)

with outFileUpload:
    # Lift the pandas row limit so displayed frames are not truncated.
    pd.set_option("display.max_rows", None)
    display(uploader)

@out.capture()
def on_upload_change(change):
    """Process a freshly uploaded history CSV and display the ranked titles.

    The uploaded bytes are parsed as CSV, cleaned, extended with the
    "most common words" sentence, embedded and sorted by distance to that
    sentence. The synthetic sentence row is removed again before the result
    is displayed in the ``out`` widget.

    Args:
        change: Change notification from the upload widget; the uploaded
            bytes are read from ``change['owner'].data[0]``.
    """
    df = pd.read_csv(io.BytesIO(change['owner'].data[0]))

    out.clear_output(wait=True)

    if 'title' not in df:
        display("error: csv file needs to have a 'title' column")
        return

    df = clear_data(df)
    df = append_most_common_sentence(df)

    # The appended summary sentence is the last row; remember it so it can be
    # reported and removed from the listing after sorting.
    compSentence = df.tail(1)

    use_embeddings = generate_embeddings(df['text'])

    df = sort_by_narest_neighbor(df, use_embeddings)

    df.drop(compSentence.index, inplace=True)

    display("comparing sentences to '" + compSentence['text'].values[0] + "'")

    # Only 'title' is validated above, so show 'date' / 'time' just when the
    # uploaded file actually provides them instead of failing with KeyError.
    shown_columns = [col for col in ('date', 'time', 'title') if col in df.columns]
    display(df[shown_columns])
        
# Run on_upload_change whenever the uploader's `_counter` trait changes.
# NOTE(review): `_counter` is a private trait of FileUpload; confirm it
# exists in the installed ipywidgets version.
uploader.observe(on_upload_change, names='_counter')

# Render the upload box followed by the results area.
display(outFileUpload)
display(out)

Output(layout=Layout(border='1px solid black', height='70px'))

Output(layout=Layout(border='1px solid black', height='400px', overflow='scroll'))