In [1]:
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import normalize
import numpy as np
import json

In [2]:
# Load the sample record; later cells read its 'title' and 'description' fields.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the read does
# not depend on the platform's default locale encoding.
with open("Health_DP_Sample.json", encoding="utf-8") as f:
    doc = json.load(f)

In [3]:
# Fetch the NLTK packages used below: the stop-word list and WordNet.
# Statement order matters: the stop-word corpus must be available before
# stopwords.words(...) is called further down.
nltk.download("stopwords")
nltk.download("wordnet")

# Sentence-embedding model used by vectorize_text.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# English stop words held in a set for the membership test in preprocess_text.
stop_words = set(stopwords.words("english"))
# Lemmatizer applied to each surviving token in preprocess_text.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/joby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/joby/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def preprocess_text(text):
    """Strip stop words from ``text`` and lemmatize what remains.

    The text is split on whitespace. A token is dropped when its lowercase
    form appears in the module-level ``stop_words`` set; every surviving
    token is passed, with its original casing, through the module-level
    ``lemmatizer``.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Lazily filter first, then lemmatize only the tokens that were kept.
    kept = (token for token in text.split() if token.lower() not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

In [5]:
def vectorize_text(text):
    """Embed ``text`` with the module-level ``bert_model``.

    The text is first cleaned by ``preprocess_text``; the cleaned string is
    then encoded with ``convert_to_tensor=True`` and that encoding is returned.
    """
    cleaned = preprocess_text(text)
    return bert_model.encode(cleaned, convert_to_tensor=True)


In [6]:
def normalize_vector(vector):
    """Scale ``vector`` to unit Euclidean (L2) length.

    Args:
        vector: Array-like of numbers accepted by ``np.linalg.norm``.

    Returns:
        ``vector`` divided by its L2 norm. When the norm is exactly zero the
        input is returned unchanged, because dividing by zero would fill the
        result with NaN/inf values (which also cannot be written as valid
        JSON later on).
    """
    norm = np.linalg.norm(vector)

    # An all-zero vector has no direction; leave it as-is instead of
    # producing NaN entries.
    if norm == 0:
        return vector

    return vector / norm

In [7]:
# Embed the record's 'title' field, scale the embedding with
# normalize_vector, and print the result for inspection.
# normalized_vector_title is read again when health_vec is built.
text =doc['title'] 
title_vec = vectorize_text(text)
normalized_vector_title = normalize_vector(title_vec)
print(normalized_vector_title)

tensor([-4.4759e-02,  2.6807e-02, -3.1756e-02, -3.8253e-02,  1.1743e-02,
        -1.3167e-02,  9.5468e-02,  3.2305e-02,  3.4718e-02, -4.9148e-02,
         1.9468e-02, -3.6832e-02,  5.0836e-02, -3.3810e-02, -6.3641e-02,
        -8.0272e-02,  7.2007e-03, -1.9855e-02, -1.0068e-01,  2.3502e-02,
        -1.1980e-01,  1.3029e-01, -1.6273e-02,  5.1199e-02, -2.6417e-02,
         9.0379e-02, -6.7155e-03, -2.2542e-02, -2.5973e-02, -4.0634e-02,
         3.4524e-02,  5.2234e-03,  6.5907e-02,  1.0363e-02,  1.2069e-02,
        -2.3436e-02, -3.2667e-02, -3.3494e-02, -9.3401e-02, -1.4006e-02,
        -3.9347e-02, -5.4132e-02,  5.3397e-03,  5.4055e-03,  9.4815e-02,
         1.6475e-02, -3.4937e-02,  1.6269e-02,  6.6828e-02,  8.9474e-02,
        -7.7365e-02, -3.1951e-02, -1.9706e-02,  7.1317e-02, -1.4074e-02,
        -1.2214e-02, -8.0209e-02, -3.0453e-02, -4.7552e-02,  3.7486e-03,
        -1.8176e-02, -2.3505e-02,  3.8087e-02,  8.3917e-02,  5.9306e-04,
        -9.7659e-03, -2.5170e-02, -1.9283e-02, -3.5

In [8]:
# Same pipeline as for the title, applied to the 'description' field.
# Note: this rebinds the notebook-level name `text`.
text =doc['description']
des_vec = vectorize_text(text)
normalized_vector_des = normalize_vector(des_vec)
print(normalized_vector_des)

tensor([-5.1052e-02,  4.5607e-02, -4.5844e-02, -1.0701e-01, -5.8479e-02,
        -6.6257e-02, -6.0500e-02,  4.3353e-02, -5.6408e-02, -2.4659e-02,
        -5.5118e-02,  2.6802e-02,  1.0755e-04, -6.6141e-02, -3.4425e-02,
        -2.2008e-02, -1.2725e-02, -2.1297e-02, -3.7298e-02,  5.2698e-02,
        -1.2563e-01,  1.0736e-01, -4.6404e-02,  4.6559e-02,  8.7819e-03,
         6.2768e-02, -1.9248e-02, -3.7355e-02,  4.2828e-02, -5.3309e-02,
         1.7371e-02, -4.0365e-02,  2.7461e-02,  1.2474e-01, -3.3966e-02,
        -3.2097e-02, -5.1496e-02,  4.3404e-02, -6.2422e-02, -7.6605e-03,
        -1.8851e-02,  1.1985e-02, -3.7119e-02,  9.8176e-02,  3.7773e-02,
         3.9340e-02, -1.1511e-01, -2.3849e-02,  8.5075e-02,  8.4552e-02,
        -1.0786e-01, -2.4007e-02,  1.4387e-02,  7.0427e-02, -3.1121e-02,
         4.8883e-02, -3.3397e-02,  1.2507e-02, -9.6168e-02,  1.8430e-03,
        -1.0051e-02,  9.2619e-03,  7.8670e-02,  9.0292e-02, -6.6373e-02,
         5.9961e-02,  1.9992e-02,  1.6413e-02,  1.2

In [9]:
# Collect both normalized embeddings under the keys that will be added to the
# record. .tolist() converts each vector to a plain Python list so that it can
# be passed to json.dump when the record is written out.
health_vec = {
    "title_vec": normalized_vector_title.tolist(),
    "description_vec": normalized_vector_des.tolist()
}

In [10]:
# Work on a shallow copy so that adding keys to the combined record does not
# also mutate the originally loaded `doc` (plain assignment would only alias it).
# The misspelled name is kept on purpose: the following cells reference it.
combined_heatlth = dict(doc)

In [12]:
# Add the 'title_vec' and 'description_vec' entries from health_vec to the
# record; dict.update modifies combined_heatlth in place.
combined_heatlth.update(health_vec)

In [13]:
# Write the record, including the two embedding lists, to combined_health.json
# as JSON indented by 4 spaces. Opening with mode 'w' overwrites any existing file.
with open('combined_health.json', 'w') as output_file:
    json.dump(combined_heatlth, output_file, indent=4)