# Finding Data Drift in Unstructured Text Data using a HuggingFace Sentence Transformer

### Here we compute a sentence-embedding vector for each baseline data point (PubMed abstracts) with a Sentence Transformer, then compute the cosine similarity between every pair of baseline points and average those similarities. We then introduce a new data point (taken from the CoLA dataset in this notebook), compute its cosine similarity with each baseline point, and compare its average similarity with the baseline average to check how far the new data has drifted from the baseline.

In [1]:
import pandas as pd
import numpy as np
import pickle
import ast

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

In [2]:
# Sentence-embedding model used by every `model.encode(...)` call in this notebook.
MODEL_NAME = r"sentence-transformers/paraphrase-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

### Note: the dataset files were already unzipped using separate code.

In [3]:
# Load the baseline dataset; the analysis below focuses on its `covid_19` column.
BASELINE_CSV = "pubmed_abstracts.csv"
df = pd.read_csv(BASELINE_CSV)
df.head()

Unnamed: 0.1,Unnamed: 0,deep_learning,covid_19,human_connectome,virtual_reality,brain_machine_interfaces,electroactive_polymers,pedot_electrodes,neuroprosthetics,deep_learning_links,covid_19_links,human_connectome_links,virtual_reality_links,brain_machine_interfaces_links,electroactive_polymers_links,pedot_electrodes_links,neuroprosthetics_links
0,0,(['Magnetic resonance spectroscopic imaging (M...,(['As cancer researchers shutter their labs to...,"(['For decades, it has been largely unknown to...",(['To evaluate the differences between walking...,(['All neural information systems (NIS) rely o...,(['A mediatorless glucose biosensor was develo...,(['In the growing field of brain-machine inter...,(['The heart continuously and cyclically commu...,https://www.ncbi.nlm.nih.gov/pubmed/31352337,https://www.ncbi.nlm.nih.gov/pubmed/32234716,https://www.ncbi.nlm.nih.gov/pubmed/25420254,https://www.ncbi.nlm.nih.gov/pubmed/30653920,https://www.ncbi.nlm.nih.gov/pubmed/27669264,https://www.ncbi.nlm.nih.gov/pubmed/22967516,https://www.ncbi.nlm.nih.gov/pubmed/28266832,https://www.ncbi.nlm.nih.gov/pubmed/31051293
1,1,(['Existing deep convolutional neural networks...,"(['In December 2019, the outbreak of pneumonia...",(['While resting-state functional magnetic res...,(['Potentially painful invasive procedures are...,(['Independent component analysis (ICA) as a p...,(['Hierarchical structures of hybrid materials...,(['High-performance transparent and flexible t...,(['This study was aimed at investigating the i...,https://www.ncbi.nlm.nih.gov/pubmed/31329133,https://www.ncbi.nlm.nih.gov/pubmed/32235387,https://www.ncbi.nlm.nih.gov/pubmed/25589760,https://www.ncbi.nlm.nih.gov/pubmed/30679136,https://www.ncbi.nlm.nih.gov/pubmed/27631789,https://www.ncbi.nlm.nih.gov/pubmed/23545560,https://www.ncbi.nlm.nih.gov/pubmed/28937733,https://www.ncbi.nlm.nih.gov/pubmed/30655080
2,2,(['Deep learning techniques have been increasi...,"([], 'Treating COVID-19 with Chloroquine.')",(['This paper presents the experimental evalua...,"([""Early exposure to radiological cross-sectio...","([], 'Brain-machine interfaces: assistive, tho...",(['An analytical method was researched for the...,"(['In this investigation, we employed a novel ...",(['Low-intensity focused ultrasound stimulatio...,https://www.ncbi.nlm.nih.gov/pubmed/31329567,https://www.ncbi.nlm.nih.gov/pubmed/32236562,https://www.ncbi.nlm.nih.gov/pubmed/25624185,https://www.ncbi.nlm.nih.gov/pubmed/30697948,https://www.ncbi.nlm.nih.gov/pubmed/27654684,https://www.ncbi.nlm.nih.gov/pubmed/22265536,https://www.ncbi.nlm.nih.gov/pubmed/28825302,https://www.ncbi.nlm.nih.gov/pubmed/30952150
3,3,(['The original article unfortunately containe...,"(['18 years ago, in 2002, the world was astoni...","([], ""For Microscopy special issue on 'connect...",(['To investigate the effects of various rehab...,(['While motor-imagery based brain-computer in...,(['The antibacterial properties of a nanocompo...,(['Great progress has been made on the cyclabi...,(['Our brain has developed a specific system t...,https://www.ncbi.nlm.nih.gov/pubmed/31350607,https://www.ncbi.nlm.nih.gov/pubmed/32235085,https://www.ncbi.nlm.nih.gov/pubmed/25652424,https://www.ncbi.nlm.nih.gov/pubmed/30686327,https://www.ncbi.nlm.nih.gov/pubmed/27578310,https://www.ncbi.nlm.nih.gov/pubmed/22091864,https://www.ncbi.nlm.nih.gov/pubmed/28306233,https://www.ncbi.nlm.nih.gov/pubmed/30685486
4,4,(['The most common applications of artificial ...,"([], 'Covid-19: Doctors still at ""considerable...",(['A central feature of theories of spatial na...,(['Virtual reality (VR) is a technology that a...,(['The disorders of consciousness refer to cli...,(['The metal-mediated self-assembly of coordin...,(['With the aim of a reliable biosensing exhib...,"([""Electrophysiological techniques have improv...",https://www.ncbi.nlm.nih.gov/pubmed/31348869,https://www.ncbi.nlm.nih.gov/pubmed/32234713,https://www.ncbi.nlm.nih.gov/pubmed/25601828,https://www.ncbi.nlm.nih.gov/pubmed/30668519,https://www.ncbi.nlm.nih.gov/pubmed/27590972,https://www.ncbi.nlm.nih.gov/pubmed/22624584,https://www.ncbi.nlm.nih.gov/pubmed/29201623,https://www.ncbi.nlm.nih.gov/pubmed/30564810


In [4]:
def parse_text(x):
    """Parse the string form of a Python literal into the object it denotes.

    Args:
        x: Source text of a Python literal, e.g. ``"(['some text'], 'more text')"``.

    Returns:
        The value produced by ``ast.literal_eval(x)``.

    Raises:
        ValueError, SyntaxError: if ``x`` is not a valid Python literal.
    """
    literal_value = ast.literal_eval(x)
    return literal_value

# For every non-null covid_19 cell, parse it and keep the first item of its
# leading list; cells whose leading list is empty are skipped.
covid_column = df['covid_19']
not_none_df = covid_column[covid_column.notna()]
parsed_cells = (parse_text(cell) for cell in not_none_df)
sents = [parsed[0][0] for parsed in parsed_cells if parsed[0] != []]

In [5]:
# Take the first ten sentences as the baseline sample, then drop the entry at
# index 7 (per the original note, a Chinese-language example).
sample_med_sents = sents[0:10]
del sample_med_sents[7]
sample_med_sents

['As cancer researchers shutter their labs to comply with COVID-19-related work restrictions, some are turning their attention, resources, and technical know-how to the challenge of tackling the deadly coronavirus.',
 'In December 2019, the outbreak of pneumonia caused by a novel coronavirus, severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), has led to a serious pandemic in China and other countries worldwide. So far, more than 460,000 confirmed cases were diagnosed in nearly 190 countries, causing globally over 20,000 deaths. Currently, the epidemic is still spreading and there is no effective means to prevent the infection. Vaccines are proved to be the most effective and economical means to prevent and control infectious diseases. Several countries, companies, and institutions announced their programs and progress on vaccine development against the virus. While most of the vaccines are under design and preparation, there are some that have entered efficacy evaluation in 

In [6]:
# Encode each baseline sentence into its embedding vector, one call per sentence.
med_embeddings = [model.encode(sentence) for sentence in sample_med_sents]

In [7]:
# Serialize the baseline embeddings to med_sample_embeddings.pkl.
with open("med_sample_embeddings.pkl", "wb") as pickle_file:
    pickle.dump(med_embeddings, pickle_file)

In [12]:
# Load the CoLA in-domain training file (tab-separated, no header row) as the
# source of new data points.
COLA_TRAIN_TSV = "cola_public/raw/in_domain_train.tsv"
df = pd.read_csv(COLA_TRAIN_TSV, sep="\t", header=None)
df.head()

Unnamed: 0,0,1,2,3
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [13]:
# Take the first 10 sentences (column 3) as new data points for a quick check.
# They are kept under their own name instead of overwriting `df`: rebinding `df`
# to a list made this cell fail on a second run and changed what `df` means.
cola_sents = df[3].head(10).to_list()

# Encode each new sentence into its embedding vector, one call per sentence.
embeddings = [model.encode(text) for text in cola_sents]

In [14]:
# function to find cosine similarities
def calc_cos_sim(vec1, vec2):
    """Return the cosine similarity of two embedding vectors.

    Thin wrapper around ``cos_sim`` from ``sentence_transformers.util``; the
    result is returned exactly as that function produces it. Callers in this
    notebook read the scalar score from it via ``.tolist()[0][0]``.

    Args:
        vec1: First embedding vector.
        vec2: Second embedding vector.
    """
    return cos_sim(vec1, vec2)

In [15]:
# Cosine similarity for every ordered pair of distinct baseline embeddings,
# rounded to 4 decimals. Each unordered pair appears twice: (i, j) and (j, i).
n_baseline = len(med_embeddings)
scores = [
    round(calc_cos_sim(med_embeddings[row], med_embeddings[col]).tolist()[0][0], 4)
    for row in range(n_baseline)
    for col in range(n_baseline)
    if row != col
]

In [16]:
# Display all pairwise baseline similarity scores.
scores

[0.4012,
 0.34,
 0.4053,
 0.3746,
 0.5438,
 0.3505,
 0.4403,
 0.1719,
 0.4012,
 0.685,
 0.5482,
 0.4119,
 0.5635,
 0.7755,
 0.6976,
 0.3709,
 0.34,
 0.685,
 0.4705,
 0.2618,
 0.5281,
 0.6645,
 0.5861,
 0.4317,
 0.4053,
 0.5482,
 0.4705,
 0.3291,
 0.4804,
 0.4687,
 0.3782,
 0.2168,
 0.3746,
 0.4119,
 0.2618,
 0.3291,
 0.5362,
 0.3377,
 0.3906,
 0.2518,
 0.5438,
 0.5635,
 0.5281,
 0.4804,
 0.5362,
 0.5585,
 0.6073,
 0.3581,
 0.3505,
 0.7755,
 0.6645,
 0.4687,
 0.3377,
 0.5585,
 0.7224,
 0.455,
 0.4403,
 0.6976,
 0.5861,
 0.3782,
 0.3906,
 0.6073,
 0.7224,
 0.4793,
 0.1719,
 0.3709,
 0.4317,
 0.2168,
 0.2518,
 0.3581,
 0.455,
 0.4793]

In [18]:
# Mean pairwise similarity within the baseline set; `check_distance` compares
# new scores against this value.
score_total = sum(scores)
avg_score_baseline = score_total / len(scores)
avg_score_baseline

0.4609166666666665

In [24]:
# lets save this score to evaluate new points based on this average score
def check_distance(new_score, baseline=None):
    """Return how far a new similarity score falls below the baseline average.

    Args:
        new_score: Average cosine similarity of a new data point against the
            baseline embeddings.
        baseline: Reference average to compare against. When omitted, the
            notebook-level ``avg_score_baseline`` is used.

    Returns:
        ``baseline - new_score``. A larger positive value means the new point
        is less similar to the baseline data than the baseline points are to
        each other.
    """
    if baseline is None:
        # Reading a module-level name needs no `global` declaration.
        baseline = avg_score_baseline
    return baseline - new_score

#check_distance(1)

0.4609166666666665


In [25]:
# Cosine similarity of the first new data point against every baseline
# embedding (rounded to 4 decimals), followed by the mean of those scores.
new_scores = [
    round(calc_cos_sim(baseline_vec, embeddings[0]).tolist()[0][0], 4)
    for baseline_vec in med_embeddings
]

new_datapoint_avg_score = sum(new_scores) / len(new_scores)
new_datapoint_avg_score

0.0897111111111111

In [26]:
# Second example: the same comparison for the new data point at index 3.
new_scores = [
    round(calc_cos_sim(baseline_vec, embeddings[3]).tolist()[0][0], 4)
    for baseline_vec in med_embeddings
]

new_datapoint_avg_score = sum(new_scores) / len(new_scores)
new_datapoint_avg_score

0.040188888888888885