In [None]:
# Attach Google Drive to this Colab runtime. force_remount=True asks for a
# fresh mount even when Drive is already mounted.
from google.colab import drive

drive_root = '/content/drive'
drive.mount(drive_root, force_remount=True)

Mounted at /content/drive


In [None]:
# Work from the project folder so the relative paths used in later cells
# ('Data/...', 'models/...') resolve against it.
%cd /content/drive/MyDrive/NLP/Project/


/content/drive/MyDrive/NLP/Project


In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 9.0 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 40.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 41.7 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |██████████████████████

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, InputExample, SentencesDataset, losses, util
import scipy
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import torch
from datetime import datetime
import seaborn as sns
import os
import gzip
import csv

In [None]:
# Input corpus, relative to the current working directory.
data = 'Data/final_data.csv'

# Every run writes its fine-tuned model to a folder named after the moment
# this cell was executed, so earlier checkpoints are never overwritten.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = 'models/model' + run_stamp

In [None]:
# Load the article table from the CSV path configured in `data`.
df = pd.read_csv(data)
# Preview the first rows (the cell's last expression is displayed).
df.head()

Unnamed: 0.1,Unnamed: 0,title,content
0,0,Você sabe um país para cada letra do alfabeto?,Você sabe um país para cada letra do alfabeto?...
1,1,If You’ve Tasted 21/52 Of These International ...,Food | If You’ve Tasted 21/52 Of These Interna...
2,2,16 Twenties Vs. Thirties Tweets That Are So Ac...,16 Twenties Vs. Thirties Tweets That Are So Ac...
3,3,The 19 Most Tone-Deaf Things Celebrities Have ...,Celebrity | The 19 Most Tone-Deaf Things Celeb...
4,4,"If You're Bored, Try Matching These Disney Pri...","TV and Movies | If You're Bored, Try Matching ..."


In [None]:
# The first column of the loaded frame is 'Unnamed: 0.1' (see the preview
# above); its values 0, 1, 2, ... look like a leftover row index.
# Drop it by name rather than by position so that re-running this cell is a
# no-op instead of silently removing the next column in line.
df = df.drop(columns=['Unnamed: 0.1'], errors='ignore')

In [None]:
# Preview the frame again to check the result of the column drop above.
df.head()

Unnamed: 0,title,content
0,Você sabe um país para cada letra do alfabeto?,Você sabe um país para cada letra do alfabeto?...
1,If You’ve Tasted 21/52 Of These International ...,Food | If You’ve Tasted 21/52 Of These Interna...
2,16 Twenties Vs. Thirties Tweets That Are So Ac...,16 Twenties Vs. Thirties Tweets That Are So Ac...
3,The 19 Most Tone-Deaf Things Celebrities Have ...,Celebrity | The 19 Most Tone-Deaf Things Celeb...
4,"If You're Bored, Try Matching These Disney Pri...","TV and Movies | If You're Bored, Try Matching ..."


In [None]:
# Loading pre-trained model
# Use the GPU when one is attached; otherwise fall back to the CPU so the
# cell still runs on a runtime without an accelerator.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)

In [None]:
# Move the model to the GPU when one is available (CPU otherwise). The call
# is the cell's last expression, so the model's module layout is displayed.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
# Get text embeddings for each column
def get_embed(df, model):
  """Encode every column of ``df`` with ``model`` and collect the vectors.

  A column named ``<name>`` produces a column ``<name>Embed`` in the result,
  holding one embedding per row. All texts of a column are handed to
  ``model.encode`` in a single call so the model can batch them, instead of
  being encoded one row at a time.

  Args:
    df: DataFrame whose columns are encoded one after another. Every column
      is encoded, so it should contain text columns only.
    model: object with an ``encode(list_of_texts)`` method returning one
      vector per input text (e.g. a ``SentenceTransformer``).

  Returns:
    DataFrame in which ``titleEmbed`` and ``contentEmbed`` come first; the
    embedding column of any other column of ``df`` is appended after them.
  """
  embed_df = pd.DataFrame(columns=['titleEmbed', 'contentEmbed'])

  for col in df:
    texts = df[col].tolist()
    # list(...) splits the 2-D encode result into one vector per row.
    embed_df[f"{col}Embed"] = list(model.encode(texts))
  return embed_df

In [None]:
# Embed the columns of df with the pre-trained model and preview the result.
# NOTE(review): get_embed loops over every column of df, so any non-text
# column still present in df is passed to the model as well — confirm df
# holds only the text columns at this point.
embeddings_df = get_embed(df, model)
embeddings_df.head()

Unnamed: 0,titleEmbed,contentEmbed
0,"[-0.05714297, -0.23945381, -0.055105474, 0.060...","[-0.06427611, -0.22729957, -0.04783242, 0.0602..."
1,"[-0.056566045, -0.21227367, -0.047760163, 0.05...","[-0.08923025, -0.20196928, -0.037119754, 0.065..."
2,"[-0.050068036, -0.21959588, -0.04476309, 0.054...","[-0.074253336, -0.21098608, -0.038652975, 0.05..."
3,"[-0.036889568, -0.18872297, -0.04491199, 0.039...","[-0.085081406, -0.2069476, -0.036814198, 0.053..."
4,"[-0.014576565, -0.18567435, -0.04128795, 0.057...","[-0.056013476, -0.20482188, -0.041197047, 0.05..."


In [None]:
# Shape of the embedding table built above. The table is bound to
# `embeddings_df`; `embed_df` only exists as a local name inside get_embed /
# get_similarity, so referencing it here raises NameError on a fresh kernel.
embeddings_df.shape

(1500, 3)

In [None]:
# Calculate similarity scores based on desired metric
def get_similarity(similarity, embed_df, max_pairs=80000):
  """Score title/content embedding pairs with the chosen metric.

  Pairs are generated title-major: the first title against every content
  embedding, then the second title, and so on. Generation stops after
  ``max_pairs`` pairs. (Previously the cap only left the inner loop, so each
  remaining title still contributed one extra row.)

  Args:
    similarity: ``"dot"`` (``util.dot_score``) or ``"cosine"``
      (``util.cos_sim``).
    embed_df: table with ``titleEmbed`` and ``contentEmbed`` columns, each
      cell holding one embedding vector.
    max_pairs: maximum number of pairs to score; ``None`` scores all pairs.

  Returns:
    DataFrame with columns ``titleEmbed``, ``contentEmbed`` and one score
    column named after ``similarity``.

  Raises:
    ValueError: if ``similarity`` is not a supported metric.
  """
  from itertools import islice, product

  scorers = {"dot": util.dot_score, "cosine": util.cos_sim}
  if similarity not in scorers:
    raise ValueError(
        "Unsupported similarity %r; expected one of %s" % (similarity, sorted(scorers))
    )
  score = scorers[similarity]

  pairs = product(embed_df['titleEmbed'], embed_df['contentEmbed'])
  # Collect plain records and build the frame once; growing a DataFrame one
  # cell at a time is far slower for tens of thousands of rows.
  # The scorer returns a 1x1 matrix for a single pair, hence the [0][0].
  rows = [
      (title, content, float(score(title, content)[0][0]))
      for title, content in islice(pairs, max_pairs)
  ]
  return pd.DataFrame(rows, columns=['titleEmbed', 'contentEmbed', similarity])

In [None]:
# Score title/content embedding pairs by dot product. The embedding table is
# bound to `embeddings_df`; `embed_df` is not defined at notebook level and
# would raise NameError on a fresh kernel.
dot_df = get_similarity("dot", embeddings_df)

In [None]:
# (rows, columns) of the scored-pair table.
dot_df.shape

(81447, 3)

In [None]:
# Compiling training dataset
# Keep every second scored pair (positions 0, 2, 4, ...) and wrap each one in
# an InputExample: texts = [title, content], label = (dot/cosine) score.
# The first three columns are read by position via .iloc, which avoids the
# deprecated integer lookup on a label-indexed row and no longer shadows the
# builtin `input`.
# NOTE(review): the first two columns of dot_df hold the embedding vectors
# copied by get_similarity, not the raw title/content strings — confirm that
# vectors are really what should be passed as `texts`.
train_examples = [
    InputExample(texts=[title, content], label=score)
    for title, content, score in dot_df.iloc[::2, :3].itertuples(index=False, name=None)
]

In [None]:
# The list of InputExample objects goes straight into the DataLoader. The
# SentencesDataset wrapper that used to be built here was never passed to
# anything, so it has been dropped.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
# NOTE(review): CosineSimilarityLoss is trained against the labels stored in
# train_examples, which come from the score column of dot_df (dot products in
# the cell above). Confirm that those values are in the range this loss
# expects.
train_loss = losses.CosineSimilarityLoss(model)


#Tuning the model
# One epoch with 100 warm-up steps; the tuned model is written to the
# timestamped folder in model_save_path.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100, output_path=model_save_path)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5071 [00:00<?, ?it/s]

In [None]:
# Reload the fine-tuned weights from disk. Prefer the folder written by this
# session's model.fit(...) call; fall back to the checkpoint of the earlier
# run when that folder does not exist (for example when training was skipped).
trained_model_dir = model_save_path if os.path.isdir(model_save_path) else 'models/model2021-11-30_05-16-24'
model = SentenceTransformer(trained_model_dir)

In [None]:
# Build one searchable string per article. A space separates title and
# content so the last word of the title no longer fuses with the first word
# of the content (e.g. "...chuvaShopping"), and missing values become empty
# strings instead of turning the whole text into NaN.
df['text'] = df['title'].fillna('') + ' ' + df['content'].fillna('')
sentences = df['text'].values.tolist()
# Encode the whole corpus once; the search cell compares queries against it.
sentence_embeddings = model.encode(sentences)
sentence_embeddings

array([[-0.06579898, -0.2263086 , -0.0462783 , ..., -0.04309922,
        -0.06827192, -0.05653097],
       [-0.08882518, -0.20710036, -0.03661259, ..., -0.04093271,
        -0.06884082, -0.0565644 ],
       [-0.0770442 , -0.21066673, -0.03708997, ..., -0.04754854,
        -0.07090177, -0.0575573 ],
       ...,
       [-0.08888742, -0.16053383, -0.02033036, ..., -0.08305106,
        -0.02201203, -0.03985842],
       [-0.05158786, -0.177235  , -0.02396799, ..., -0.07936928,
        -0.08268517, -0.03728066],
       [-0.02711055, -0.21812339, -0.01123542, ..., -0.05721707,
        -0.03687089, -0.03999661]], dtype=float32)

In [None]:
# Colab form fields: the text to search for and how many hits to list.
query = 'actrss brekup' #@param {type: 'string'}
number_top_matches = 8 #@param {type: "number"}

# Embed the query with the same model that produced sentence_embeddings.
queries = [query]
query_embeddings = model.encode(queries)

print("Semantic Search Results after training: ")

for query, query_embedding in zip(queries, query_embeddings):
    # One cosine distance per corpus entry (distance = 1 - cosine similarity).
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    # Pair each corpus position with its distance and rank closest first.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n-----------------\n")
    print("Query:", query)
    print("\nTop " + str(number_top_matches)+ " most similar news headlines: \n")

    for idx, distance in results[:number_top_matches]:
        cosine_score = 1 - distance
        # Show only the first 100 characters of the matched text.
        print(sentences[idx][:100], "(Cosine score: %.4f)" % cosine_score)

Semantic Search Results after training: 

-----------------

Query: actrss brekup

Top 8 most similar news headlines: 

16 coisas para te ajudar a sobreviver a um dia de chuvaShopping | 16 coisas para te ajudar a sobrevi (Cosine score: 0.9996)
Você sabe um país para cada letra do alfabeto?Você sabe um país para cada letra do alfabeto? | A úni (Cosine score: 0.9996)
18 People Who Are Having A Way, Wayyyy Worse Time Stuck Indoors Than You18 People Who Are Having A W (Cosine score: 0.9996)
Você consegue diferenciar todas estas celebridades?Você consegue diferenciar todas estas celebridade (Cosine score: 0.9996)
Croque monsieur grandãoCroque monsieur grandão | Um clássico francês! | publicado March 20, 2020, 19 (Cosine score: 0.9996)
19 Dinge, die du nur verstehst, wenn du mit Katzen aufgewachsen bist19 Dinge, die du nur verstehst,  (Cosine score: 0.9996)
If You Don't Pass This Month Quiz, You'll Be EmbarrassedIf You Don't Pass This Month Quiz, You'll Be (Cosine score: 0.9996)
Responda ess