#Prerequisites:

In [8]:
!pip install requests tqdm faiss-cpu transformers tensorflow sentence-transformers textblob gensim



#Extracting and Storing Data:

In [9]:
import os
import requests
from pathlib import Path
from tqdm import tqdm
import zipfile

In [10]:
Dir=Path("./MIMIC_textdata")
def extracting_data(url, extract_to=Dir, timeout=60):
  """Download a zip archive from ``url`` and unpack it into ``extract_to``.

  The archive is streamed to ``extract_to/textbooks.zip`` and then extracted
  into the same directory.

  Args:
    url: HTTP(S) address of the zip archive.
    extract_to: Target directory (``str`` or ``Path``); created if missing.
    timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument.

  Raises:
    requests.HTTPError: If the server answers with an error status.
    zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
  """
  extract_to=Path(extract_to)

  #to create parent directories and ensure existing one
  extract_to.mkdir(parents=True, exist_ok=True)

  #Access the zip file
  zip_path=extract_to/"textbooks.zip"
  print("Downloading dataset...")
  #the context manager releases the connection even if the download fails midway
  with requests.get(url,stream=True,timeout=timeout) as response:
    #fail fast on HTTP errors instead of saving an error page as the archive
    response.raise_for_status()
    with open(zip_path,'wb') as file:
      for chunk in tqdm(response.iter_content(chunk_size=1024), unit='KB'):
        if chunk:
          file.write(chunk)
  print("Extracting data....")

  with zipfile.ZipFile(zip_path,"r") as zip_ref:
    zip_ref.extractall(extract_to)
  print("Dataset has been extracted and dowloaded.")

#archive location; extracting_data() is called without extract_to, so it unpacks into Dir
data_url="https://www.dropbox.com/scl/fi/54p9kkx5n93bffyx08eba/textbooks.zip?rlkey=2y2c5x8y0uncnddichn9cmd7n&st=m290nmkk&dl=1"
extracting_data(data_url)

Downloading dataset...


88121KB [00:02, 42590.55KB/s]


Extracting data....
Dataset has been extracted and dowloaded.


#Preprocessing Data:

In [11]:
import re
from textblob import TextBlob
from gensim.utils import simple_preprocess

#load all text files into string format into a list
def load_files(dir_file):
  """Read every ``*.txt`` file directly inside ``dir_file`` as UTF-8 text.

  Args:
    dir_file: Directory to scan (``str`` or ``Path``); sub-directories are not searched.

  Returns:
    A list with one string per file, ordered by sorted file path.
  """
  #glob() yields files in filesystem order; sorting keeps document (and therefore
  #chunk) positions stable between runs and machines
  txt_paths=sorted(Path(dir_file).glob('*.txt'))
  return [file_path.read_text(encoding='utf-8') for file_path in txt_paths]

def cleaning_tokenizing_data(text):
  """Normalise raw text and return its tokens joined by single spaces.

  Steps, in order: collapse each whitespace run to one space, lower-case,
  delete every character that is not a-z, A-Z, 0-9 or whitespace, then
  tokenise with ``simple_preprocess``.
  """
  collapsed=re.sub(r'\s+',' ',text)
  lowered=collapsed.lower()
  alnum_only=re.sub(r'[^a-zA-Z0-9\s]','',lowered)
  return ' '.join(simple_preprocess(alnum_only))

def spelling(text):
  """Return ``text`` after TextBlob's ``correct()``, converted back to ``str``."""
  corrected=TextBlob(text).correct()
  return str(corrected)

def chunked_text(text, chunk_size=200):
  """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

  Words are whatever ``str.split()`` yields; each chunk is re-joined with
  single spaces. The last chunk may be shorter than ``chunk_size``.

  Args:
    text: String to split.
    chunk_size: Maximum number of words per chunk; must be positive.

  Returns:
    List of chunk strings (empty when ``text`` holds no words).

  Raises:
    ValueError: If ``chunk_size`` is not positive.
  """
  #a negative step would make range() empty and silently drop the whole text
  if chunk_size <= 0:
    raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
  words=text.split()
  return [' '.join(words[i:i + chunk_size]) for i in range(0,len(words), chunk_size)]

#read the files under textbooks/en, normalise each one, then cut them into word chunks
documents=load_files(Dir/"textbooks/en")
cleaned_docs=list(map(cleaning_tokenizing_data, documents))
#spelling correction is left disabled: corrected_docs=[spelling(doc) for doc in cleaned_docs]
chunked_docs=[chunk for doc in cleaned_docs for chunk in chunked_text(doc)]

print(f'Total documents chunks created: {len(chunked_docs)}')

Total documents chunks created: 60061


In [12]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
import numpy as np

#report the GPU devices TensorFlow lists (an empty list would mean none were found)
print('Available devices:',tf.config.list_physical_devices('GPU'))

#Initializing Tokenizer and Model
#tokenizer and model are loaded from the same checkpoint name so they stay in sync
checkpoint='sentence-transformers/all-MiniLM-L6-v2'
tokenizer=AutoTokenizer.from_pretrained(checkpoint)
model=TFAutoModel.from_pretrained(checkpoint)

def get_embeddings_in_batch(texts, batch_size=16):
  """Embed ``texts`` with the module-level ``tokenizer``/``model`` pair.

  Each batch is tokenised (truncated to 512 tokens, padded to the longest
  item in the batch) and reduced to one vector per text by averaging the
  token embeddings of real tokens only.

  Args:
    texts: List of strings to embed.
    batch_size: Number of texts sent through the model at once.

  Returns:
    ``np.ndarray`` with one row per input text, in input order.
  """
  all_embeddings=[]
  for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]

    #tokenizing the batchs
    inputs = tokenizer(batch_texts, return_tensors='tf', truncation=True, padding=True, max_length=512)

    #embeddings of tokens
    output=model(inputs).last_hidden_state

    #mask-aware mean pooling: padding positions must not enter the average,
    #otherwise a text's vector changes with whatever else shares its batch
    mask=tf.cast(tf.expand_dims(inputs['attention_mask'], axis=-1), output.dtype)
    token_counts=tf.maximum(tf.reduce_sum(mask, axis=1), 1.0) #guard against division by zero
    batch_embeddings=(tf.reduce_sum(output*mask, axis=1)/token_counts).numpy()

    all_embeddings.extend(batch_embeddings)
  return np.array(all_embeddings)

#embed every chunk; batch_size=128 overrides the function's default of 16
embeddings=get_embeddings_in_batch(chunked_docs, batch_size=128)
print(f"Generated embeddings for chucked documents: {len(embeddings)}")

Available devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Generated embeddings for chucked documents: 60061


#Initializing Vector Database:

In [13]:
import faiss
import numpy as np

#converting vector embeddings into a contiguous float32 matrix (one row per chunk) for FAISS
embedding_matrix = np.ascontiguousarray(embeddings, dtype='float32')
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] == 0:
  raise ValueError(f'expected a non-empty 2-D embedding matrix, got shape {embedding_matrix.shape}')

#embedding size of model, read from the data so it cannot drift from the checkpoint in use
dimension=embedding_matrix.shape[1]
index=faiss.IndexFlatL2(dimension)

index.add(embedding_matrix)
print(f'Total embeddings stored:{index.ntotal}')

Total embeddings stored:60061


#Sample Verification:

In [15]:
def query_embedding(text):
  """Embed a query with the module-level ``tokenizer``/``model`` pair.

  Args:
    text: A query string, or a list of query strings.

  Returns:
    ``np.ndarray`` with one row per query: the mean of the last hidden
    states over real (non-padding) tokens.
  """
  #named `encoded` so the builtin input() is not shadowed
  encoded=tokenizer(text, return_tensors='tf', truncation=True, padding=True)
  hidden_states=model(encoded).last_hidden_state

  #mask-aware mean pooling; for a single query there is no padding, so this
  #equals a plain mean, and it stays correct when several queries are passed
  mask=tf.cast(tf.expand_dims(encoded['attention_mask'], axis=-1), hidden_states.dtype)
  token_counts=tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
  return (tf.reduce_sum(hidden_states*mask, axis=1)/token_counts).numpy()

query='what causes of hairfall'
embeddings_sample=query_embedding(query)
#FAISS expects a float32 matrix of shape (n_queries, dimension)
embeddings_sample=np.array(embeddings_sample).reshape(1,-1).astype('float32')

#number of nearest chunks to retrieve
k=5
distance, indices=index.search(embeddings_sample,k)

print('Top similar documents chunks:')
for ids in indices[0]:
  #FAISS fills missing neighbours with -1, which would otherwise index the last chunk
  if ids < 0:
    continue
  print(chunked_docs[ids])

Top similar documents chunks:
clinically significant distress or impairment in social occupational or other impor tant areas of functioning the term distress includes negative affects that may be experi enced by individuals with hair pulling such as feeling loss of control embarrassment and shame significant impairment may occur in several different areas of functioning eg social occupational academic and leisure in part because of avoidance of work school or other public situations hair pulling may be accompanied by range of behaviors or rituals involving hair thus individuals may search for particular kind of hair to pull eg hairs with specific tex ture or color may try to pull out hair in specific way eg so that the root comes out in tact or may visually examine or tactilely or orally manipulate the hair after it has been pulled eg rolling the hair between the fingers pulling the strand between the teeth bit ing the hair into pieces or swallowing the hair be triggered by feelings of