In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load only the first 1,000,000 rows. `nrows` makes the parser stop at that
# point instead of reading the whole file and slicing the result afterwards.
df = pd.read_csv("/content/2015_2_clickstream.tsv", sep="\t", nrows=1000000)

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,prev_id,curr_id,n,prev_title,curr_title,type
0,,3632887.0,121,other-google,!!,other
1,,3632887.0,93,other-wikipedia,!!,other
2,,3632887.0,46,other-empty,!!,other
3,,3632887.0,10,other-other,!!,other
4,64486.0,3632887.0,11,!_(disambiguation),!!,other


In [None]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Fetch the Punkt tokenizer data used by word_tokenize.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Drop the id, count and current-title columns; only `prev_title` and `type` remain.
data = df.drop(columns=["prev_id", "curr_id", "n", "curr_title"])
data.head()

Unnamed: 0,prev_title,type
0,other-google,other
1,other-wikipedia,other
2,other-empty,other
3,other-other,other
4,!_(disambiguation),other


In [None]:
# Cast every title to str so clean_text can call .lower() on each value.
data["prev_title"]= data["prev_title"].astype("str")

In [None]:
def clean_text(d):
    """Normalise the ``prev_title`` values of ``d`` into lowercase alphabetic text.

    Each title is lowercased, stripped of URLs and emoji-range characters,
    has common English contractions expanded, has punctuation and underscores
    turned into spaces, is tokenised with ``word_tokenize`` and finally keeps
    only purely alphabetic tokens. Stop words are left in place.

    Args:
        d: Object indexable by ``"prev_title"`` whose result exposes
            ``.values.tolist()`` (e.g. a pandas DataFrame). Every value must
            be a ``str`` because ``.lower()`` is called on it.

    Returns:
        list[str]: One cleaned, space-joined string per input row, in input
        order. A title with no alphabetic tokens yields ``""`` so the result
        stays aligned with the rows of ``d``.
    """
    # Every pattern below is loop-invariant, so build it once per call rather
    # than once per row.

    # Raw string: the original non-raw literal relied on the invalid escapes
    # "\(" and "\)", which newer Python versions warn about.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    # NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    # removes non-emoji characters such as CJK letters — confirm this is intended.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001FFFF"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

    # Applied in order. "didn't" and "haven't" replace the earlier patterns
    # "did't" / "have't", which were typos and could never match real text.
    contractions = [
        (re.compile(pattern), replacement)
        for pattern, replacement in (
            (r"i'm", "i am"),
            (r"he's", "he is"),
            (r"she's", "she is"),
            (r"that's", "that is"),
            (r"what's", "what is"),
            (r"where's", "where is"),
            (r"\'ll", " will"),
            (r"\'ve", " have"),
            (r"\'re", " are"),
            (r"\'d", " would"),
            (r"won't", "will not"),
            (r"don't", "do not"),
            (r"didn't", "did not"),
            (r"can't", "can not"),
            (r"it's", "it is"),
            (r"couldn't", "could not"),
            (r"haven't", "have not"),
        )
    ]

    punctuation_pattern = re.compile(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]")
    strip_punctuation = str.maketrans('', '', string.punctuation)

    all_reviews = []
    for text in d["prev_title"].values.tolist():
        text = text.lower()
        text = url_pattern.sub('', text)
        text = emoji_pattern.sub(r'', text)

        for pattern, replacement in contractions:
            text = pattern.sub(replacement, text)

        # Titles in this dataset separate words with "_" (e.g. "!_(disambiguation)").
        # Without this step the underscore is deleted by the punctuation table
        # below and the words are glued together ("listoftelenovelasofabs").
        text = text.replace("_", " ")
        text = punctuation_pattern.sub(" ", text)

        tokens = word_tokenize(text)
        # Remove any punctuation left inside tokens, then keep letters-only tokens.
        stripped = (token.translate(strip_punctuation) for token in tokens)
        all_reviews.append(' '.join(word for word in stripped if word.isalpha()))
    return all_reviews

# Clean every title, then show the first 20 results as a sanity check.
all_reviews = clean_text(data)
all_reviews[:20]

['other google',
 'other wikipedia',
 'other empty',
 'other other',
 'disambiguation',
 'loudenupnow',
 'other empty',
 'other google',
 'other wikipedia',
 'disambiguation',
 '',
 'other empty',
 'hero',
 'other wikipedia',
 'other google',
 'jerichorosales',
 'listoftelenovelasofabs cbn',
 'other google',
 'other wikipedia',
 'other empty']

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

In [None]:
# TF Hub handle of the Universal Sentence Encoder (version 4) that is loaded below.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 

In [None]:
# Load the encoder from TF Hub; `model` is later called directly on lists of strings.
model = hub.load(module_url)

In [None]:
# Embed every cleaned title in one call, then embed the query on its own.
# NOTE(review): `all_reviews` can hold up to 1,000,000 strings, so this
# single call is presumably memory-heavy — consider batching; TODO confirm.
sentence_embeddings = model(all_reviews)
query = "I had pizza and pasta"
# The query is passed as a one-element list, so [0] selects its result.
query_vec = model([query])[0]

In [None]:
def cosine(u, v):
    """Return the cosine similarity between two vectors.

    Args:
        u: First vector (anything ``np.dot`` and ``np.linalg.norm`` accept).
        v: Second vector, same length as ``u``.

    Returns:
        ``dot(u, v) / (norm(u) * norm(v))``, or ``0.0`` when either vector
        has zero norm, where the ratio is undefined.
    """
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0:
        # A zero-norm vector would give 0/0, i.e. NaN plus a RuntimeWarning.
        return 0.0
    return np.dot(u, v) / denom

In [None]:
# Reuse the embeddings already computed by `model(all_reviews)` instead of
# calling the encoder again for every sentence. Row i of `sentence_embeddings`
# corresponds to `all_reviews[i]`. Values may differ from per-sentence
# inference in the last few digits.
for sent, sent_vec in zip(all_reviews, np.asarray(sentence_embeddings)):
    sim = cosine(query_vec, sent_vec)
    print("Sentence = ", sent, "; similarity = ", sim)

Sentence =  other google ; similarity =  -0.013600783
Sentence =  other wikipedia ; similarity =  0.022668375
Sentence =  other empty ; similarity =  0.019568266
Sentence =  other other ; similarity =  0.030498616
Sentence =  disambiguation ; similarity =  -0.020961016
Sentence =  loudenupnow ; similarity =  0.09295138
Sentence =  other empty ; similarity =  0.019568266
Sentence =  other google ; similarity =  -0.013600783
Sentence =  other wikipedia ; similarity =  0.022668375
Sentence =  disambiguation ; similarity =  -0.020961016
Sentence =   ; similarity =  0.054741997
Sentence =  other empty ; similarity =  0.019568266
Sentence =  hero ; similarity =  0.053547896
Sentence =  other wikipedia ; similarity =  0.022668375
Sentence =  other google ; similarity =  -0.013600783
Sentence =  jerichorosales ; similarity =  0.036781814
Sentence =  listoftelenovelasofabs cbn ; similarity =  0.010826356
Sentence =  other google ; similarity =  -0.013600783
Sentence =  other wikipedia ; similar