In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
import spacy
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
RANDOM_STATE = 42

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
dataset = load_dataset("medical_questions_pairs")

In [3]:
df = pd.DataFrame(dataset['train'])

In [5]:
df_q = pd.concat([df['question_1'], df['question_2']], axis=0)

In [6]:
df_q = df_q.reset_index(drop=True)

In [7]:
df_q.isna().sum()

0

In [8]:
df_q.duplicated().sum()

1529

In [9]:
df_q = df_q.drop_duplicates().reset_index(drop=True)

In [10]:
df_q = pd.DataFrame(df_q, columns=['text'])

In [11]:
df_q

Unnamed: 0,text
0,After how many hour from drinking an antibioti...
1,Am I over weight (192.9) for my age (39)?
2,Aspirin allergy - is it worth getting a bracelet?
3,"At a doctor's visit, I hit my head against a b..."
4,Been on antibiotics 4 5wks top high tooth dent...
...,...
4562,My sperm count is 15 millions and is there a c...
4563,"Hello doctor, can you please tell me some of t..."
4564,"Hello doctor, I am 26 year old male wth progre..."
4565,I am 32 weeks pregnant and had severe headache...


In [12]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize(text):
    global nlp
    text = (re.sub(r'[^a-z0-9]+', ' ', text.lower()))
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [13]:
%%time
tqdm.pandas()
df_q['lemma_s'] = df_q['text'].progress_apply(lemmatize)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4567/4567 [00:10<00:00, 446.20it/s]

CPU times: user 10.2 s, sys: 53.1 ms, total: 10.2 s
Wall time: 10.2 s





In [14]:
df_q

Unnamed: 0,text,lemma_s
0,After how many hour from drinking an antibioti...,after how many hour from drink an antibiotic c...
1,Am I over weight (192.9) for my age (39)?,be I over weight 192 9 for my age 39
2,Aspirin allergy - is it worth getting a bracelet?,aspirin allergy be it worth get a bracelet
3,"At a doctor's visit, I hit my head against a b...",at a doctor s visit I hit my head against a bo...
4,Been on antibiotics 4 5wks top high tooth dent...,be on antibiotic 4 5wks top high tooth dentist...
...,...,...
4562,My sperm count is 15 millions and is there a c...,my sperm count be 15 million and be there a ch...
4563,"Hello doctor, can you please tell me some of t...",hello doctor can you please tell I some of the...
4564,"Hello doctor, I am 26 year old male wth progre...",hello doctor I be 26 year old male wth progres...
4565,I am 32 weeks pregnant and had severe headache...,I be 32 week pregnant and have severe headache...


In [15]:
stopwords = list(nltk_stopwords.words('english'))

In [16]:
vectorizer = TfidfVectorizer(stop_words=stopwords)

In [17]:
df_vec = vectorizer.fit_transform(df_q['lemma_s'])

In [18]:
df_vec

<4567x4948 sparse matrix of type '<class 'numpy.float64'>'
	with 42859 stored elements in Compressed Sparse Row format>

In [25]:
df_q.loc[3, 'text']

"At a doctor's visit, I hit my head against a box on the wall containing hazardous materials (used syringes, used needles...). Will I get infected?"

In [19]:
check_sentence = vectorizer.transform([df_q.loc[3, 'text']])

In [28]:
check_sentence

<1x4948 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [20]:
df_vec

<4567x4948 sparse matrix of type '<class 'numpy.float64'>'
	with 42859 stored elements in Compressed Sparse Row format>

In [21]:
df_q['dist'] = cosine_similarity(df_vec, check_sentence)

In [22]:
df_q[df_q['dist'] == 0]

Unnamed: 0,text,lemma_s,dist
0,After how many hour from drinking an antibioti...,after how many hour from drink an antibiotic c...,0.0
1,Am I over weight (192.9) for my age (39)?,be I over weight 192 9 for my age 39,0.0
5,Can Adderall (dextroamphetamine and racemic am...,can adderall dextroamphetamine and racemic amp...,0.0
6,Can coarctation of the aorta cause poor growth...,can coarctation of the aorta cause poor growth...,0.0
7,Can doxycycline treat an ear infection?,can doxycycline treat an ear infection,0.0
...,...,...,...
4555,Can sexual intercourse stimulate one's appetit...,can sexual intercourse stimulate one s appetit...,0.0
4556,What could be causing me to experience symptom...,what could be cause I to experience symptom li...,0.0
4557,Do you think being stressed for a year or so c...,do you think be stress for a year or so can ha...,0.0
4559,Is it normal for a 24 year old female to sweat...,be it normal for a 24 year old female to sweat...,0.0


In [23]:
df_q.loc[3, 'text']

"At a doctor's visit, I hit my head against a box on the wall containing hazardous materials (used syringes, used needles...). Will I get infected?"

In [24]:
df[df.question_1 == df_q.loc[3, 'text']]

Unnamed: 0,dr_id,question_1,question_2,label
6,1,"At a doctor's visit, I hit my head against a b...",Is it okay if I use the same syringe for my in...,0
7,1,"At a doctor's visit, I hit my head against a b...","Today morning, I had an appointment with the d...",1
