  # <center><h4 style = "font-size: 40px; color:#C58917;">Sentence Transformer & Cosine Similarity for Beginners</h4></center>
### <center>If you find this notebook useful, please support it with an upvote👍</center>

# All the important imports

In [None]:
import pandas as pd
import numpy as np

#wordcloud 
from wordcloud import WordCloud, ImageColorGenerator 


import tokenizers
from transformers import AutoTokenizer, AutoModel
import torch


#from tqdm import tqdm
from tqdm.autonotebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity

#plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [None]:
# Root folder of the competition data; both CSVs are read from it.
BASE_PATH = "/kaggle/input/us-patent-phrase-to-phrase-matching/"
train_df, test_df = (
    pd.read_csv(BASE_PATH + csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Preview the first ten rows of the training data.
train_df.head(10)

In [None]:
# Preview the first rows of the test data.
test_df.head()

# EDA

In [None]:
# Number of rows per distinct score (counted via the 'anchor' column),
# ordered from the highest score to the lowest.
anchor_counts_by_score = train_df.groupby('score')['anchor'].count()
temp = anchor_counts_by_score.reset_index().sort_values(by='score', ascending=False)
# Show the table with a purple gradient over the values.
temp.style.background_gradient(cmap='Purples')

In [None]:
# Histogram of the score column, followed by the exact count per score value.
score_column = train_df['score']
score_column.hist(color='#7fcdbb')
score_column.value_counts()

In [None]:
# Bar chart of how many rows carry each score, one colour per bar.
score_palette = ["#7fcdbb", "#edf8b1", "#fc9272", "#fee0d2", "#bcbddc"]
plt.figure(figsize=(12, 6))
sns.countplot(data=train_df, x='score', palette=score_palette)


In [None]:
# Funnel-area chart of the score distribution: `temp.score` supplies the
# segment text and `temp.anchor` the segment sizes.
funnel_title = {"position": "top center", "text": "Funnel-Chart of Score Distribution"}
funnel_trace = go.Funnelarea(
    text=temp.score,
    values=temp.anchor,
    title=funnel_title,
)
fig = go.Figure(funnel_trace)
fig.show()

In [None]:
# Work on an independent copy: a plain assignment would only alias
# `train_df`, so the exploratory word-count columns added to the temporary
# frame would silently appear in the training frame as well.
train_df_temp = train_df.copy()

In [None]:
# Whitespace-separated word counts for each phrase. Values are passed through
# str() first, so non-string entries are counted by their string form.
_count_words = lambda phrase: len(str(phrase).split())
train_df_temp['Anchor_Num_words'] = train_df_temp['anchor'].map(_count_words)  # words in anchor
train_df_temp['Target_Num_word'] = train_df_temp['target'].map(_count_words)  # words in target
#train_df_temp['difference_in_words'] = train_df_temp['Target_Num_word_text'] - train_df_temp['Anchor_Num_words'] #Difference in Number of words text and Selected 

In [None]:
# Display the frame, including the word-count columns.
train_df_temp

In [None]:
# Histogram of the number of words per anchor phrase.
train_df_temp.Anchor_Num_words.hist(color='#7fcdbb')

In [None]:
# Overlay the kernel density estimates of the anchor and target word counts.
plt.figure(figsize=(12,12))
# `fill=True` replaces the deprecated `shade=True`; the labels feed the legend
# so the two curves can be told apart.
p1 = sns.kdeplot(train_df_temp['Anchor_Num_words'], fill=True, color="g", label='Anchor_Num_words')
p1 = sns.kdeplot(train_df_temp['Target_Num_word'], fill=True, color="b", label='Target_Num_word')
p1.set_title('Kernel Distribution of Number Of words');
p1.legend();

In [None]:

def phrase_wordcloud(text, max_words=100, max_font_size=50, figure_size=(16, 8), color='white',
                     title=None, title_size=40, image_color=True):
    """Draw a word cloud built from an iterable of phrases.

    Args:
        text: Iterable of strings; they are joined with single spaces to form
            the corpus handed to ``WordCloud``.
        max_words: Upper bound on the number of words shown.
        max_font_size: Largest font size used in the cloud.
        figure_size: Matplotlib figure size as ``(width, height)``.
        color: Background colour of the cloud.
        title: Title placed above the image.
        title_size: Font size of the title.
        image_color: Accepted but not used by this function.
    """
    plt.figure(figsize=figure_size, facecolor=None)

    corpus = ' '.join(text)
    # Fixed seed and canvas size, so repeated calls give the same layout.
    # (collocations is left at the WordCloud default.)
    cloud_settings = dict(
        background_color=color,
        max_words=max_words,
        max_font_size=max_font_size,
        random_state=42,
        width=600,
        height=400,
    )
    cloud = WordCloud(**cloud_settings).generate(corpus)

    plt.imshow(cloud)
    title_font = {'size': title_size, 'color': 'black', 'verticalalignment': 'bottom'}
    plt.title(title, fontdict=title_font)
    plt.axis('off')
    plt.tight_layout()

In [None]:

# Word cloud over all anchor phrases in the training data.
phrase_wordcloud(train_df["anchor"].values,color='white',max_font_size=50,title_size=30,title="WordCloud of Anchors")

In [None]:
# Word cloud over all target phrases in the training data.
phrase_wordcloud(train_df["target"].values,color='white',max_font_size=50,title_size=30,title="WordCloud of Target")

# The Model

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device. An unconditional `model.cuda()` would
# fail on a CPU-only machine even though DEVICE already handles that case.
model = model.to(DEVICE)

# Data Processing

In [None]:
def encode_Phrase(phrase, max_length=8):
    """Tokenize a phrase with the module-level ``tokenizer``.

    The phrase is truncated and padded to exactly ``max_length`` tokens, and
    the result is returned as PyTorch tensors (``return_tensors='pt'``).

    Args:
        phrase: Text to tokenize.
        max_length: Fixed token length after truncation/padding. Defaults to
            8, the value previously hard-coded here.

    Returns:
        The tokenizer output; callers read its ``"input_ids"`` and
        ``"attention_mask"`` entries.
    """
    return tokenizer(
        phrase,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )


# Data Loader

In [None]:
class PhraseDataset:
    """Map-style dataset pairing anchor phrases with target phrases.

    Indexing tokenizes the anchor and the target at that position with
    ``encode_Phrase`` and returns their token ids and attention masks.
    """

    def __init__(self, anchors, targets):
        """Store the two parallel, index-aligned sequences of phrases."""
        self.anchors = anchors
        self.targets = targets

    def __len__(self):
        """Return the number of pairs, taken from the anchors sequence."""
        return len(self.anchors)

    def __getitem__(self, item):
        """Return the tokenized anchor/target pair at position ``item``.

        Returns:
            dict with keys ``anchors_ids``, ``anchors_mask``, ``targets_ids``
            and ``targets_mask``; each value is a detached copy of the
            corresponding tokenizer tensor.
        """
        # Target is encoded before anchor, matching the established call order.
        encoded_target = encode_Phrase(self.targets[item])
        encoded_anchor = encode_Phrase(self.anchors[item])

        snapshot = lambda encoded, field: encoded[field].detach().clone()

        sample = {}
        sample['anchors_ids'] = snapshot(encoded_anchor, "input_ids")
        sample['anchors_mask'] = snapshot(encoded_anchor, "attention_mask")
        sample['targets_ids'] = snapshot(encoded_target, "input_ids")
        sample['targets_mask'] = snapshot(encoded_target, "attention_mask")
        return sample

# DataLoader + Training Data

In [None]:
# Training pairs, served in batches of 32 in their original order
# (no shuffling is requested, so scores line up with the frame's rows).
train_dataset = PhraseDataset(
    anchors=train_df['anchor'].values,
    targets=train_df['target'].values,
)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=32, num_workers=0
)

# Test pairs, served in batches of 8, also in their original order.
test_dataset = PhraseDataset(
    anchors=test_df['anchor'].values,
    targets=test_df['target'].values,
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=8, num_workers=0
)

# Embeddings + Cosine Similarity

In [None]:
def _embed_phrases(ids, mask):
    """Return one mask-aware mean-pooled embedding per phrase in the batch."""
    # Move to the model's device and collapse every dimension after the batch
    # dimension, so the model receives 2-D (batch, tokens) inputs.
    ids = torch.flatten(ids.to(DEVICE, dtype=torch.long), start_dim=1)
    mask = torch.flatten(mask.to(DEVICE, dtype=torch.long), start_dim=1)

    # First element of the model output is used as the per-token embeddings.
    token_embeddings = model(ids, attention_mask=mask)[0]

    # Average only over real tokens: padding positions (mask == 0) get zero
    # weight, so they no longer dilute the phrase embedding.
    weights = mask.unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)  # guard against /0
    return summed / token_counts


def process_data(dataloader):
    """Compute a cosine-similarity score for every anchor/target pair.

    Each batch is expected to be a dict with ``anchors_ids``,
    ``anchors_mask``, ``targets_ids`` and ``targets_mask`` tensors. Anchors
    and targets are embedded with the module-level ``model`` on ``DEVICE``
    and mean-pooled over their non-padding tokens; the score for a pair is
    the cosine similarity of its two embeddings.

    Args:
        dataloader: Sized iterable of batches (``len()`` is used for the
            progress bar).

    Returns:
        list[float]: one score per pair, in the order the pairs were yielded.
    """
    scores = []
    progress = tqdm(dataloader, total=len(dataloader))  # progress bar over batches

    # Inference only: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for data in progress:
            anchor_vectors = _embed_phrases(data['anchors_ids'], data['anchors_mask'])
            target_vectors = _embed_phrases(data['targets_ids'], data['targets_mask'])

            # Row-wise similarity between matching pairs; avoids computing a
            # full batch-by-batch matrix only to read its diagonal.
            pair_scores = torch.nn.functional.cosine_similarity(
                anchor_vectors, target_vectors, dim=1
            )
            scores.extend(pair_scores.cpu().tolist())

    return scores

In [None]:
# Score every training pair, then show how many scores were produced.
scores = process_data(train_data_loader)
len(scores)

# Train data - Actual Score Vs Predicted Score

In [None]:
# Attach the computed similarities next to the labelled scores and display the frame.
train_df['predicted_score'] = scores
train_df

# Test Evaluation

In [None]:
# Display the test data before scoring.
test_df

In [None]:
# Score every test pair.
test_scores = process_data(test_data_loader)
# NOTE(review): this bare expression is not the cell's last line, so in a
# notebook its value is presumably not displayed — confirm whether it is needed.
len(test_scores)
test_scores

In [None]:
# Attach the computed similarities to the test frame and display it.
test_df['predicted_score'] = test_scores
test_df

# Submission

In [None]:
# Load the sample submission from the same data folder as the train/test CSVs
# (BASE_PATH) instead of a separate, working-directory-relative path.
submission_df = pd.read_csv(BASE_PATH + 'sample_submission.csv')
# Item assignment always targets the column; attribute assignment only does so
# when a column of that name already exists.
submission_df['score'] = test_scores
submission_df.head()

In [None]:
#submission_df.to_csv('submission.csv', index=False)