In [8]:
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import normalize
import numpy as np
import json

In [9]:
with open("C360_updated.json") as f:
    doc = json.load(f)

In [15]:
doc

{'title': 'Customer_360',
 'description': 'Customer data from various sources, enabling a holistic view of customers.',
 'owners': 'Ishmin Singh',
 'version': 'v1.1',
 'version_date': 'July 17, 2023',
 'tags': ['marketing', 'customer_360'],
 'health': {'quality_score': '70%', 'operational_health': '70%'},
 'data_sources': {'table': [{'title': 'customers',
    'description': 'Contains customer information, including customer_index, name, contact details, and other relevant attributes.',
    'version': 'v2.0',
    'version_date': 'july 15, 2023',
    'workflow': {'title': 'Customer wrokflow',
     'description': 'Automated workflow for processing and analyzing customer data',
     'datasets': {'source': '3rdparty', 'source_name': 'customer_data.csv'},
     'streaming': False,
     'last_run': 'November 18, 2023'},
    'quality_score': '78%',
    'completeness': '95%',
    'recency': 'Daily',
    'frequency': 'Real-time',
    'schema': [{'title': 'customer_id', 'data_type': 'string'},
   

In [10]:
nltk.download("stopwords")
nltk.download("wordnet")

bert_model = SentenceTransformer('all-MiniLM-L6-v2')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/j/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/j/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
def preprocess_text(text):
    tokens = text.split()
    tokens = [word for word in tokens if word.lower() not in stop_words]
    
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

In [12]:
def vectorize_text(text):
    preprocessed_text = preprocess_text(text)
    
    vector = bert_model.encode(preprocessed_text, convert_to_tensor=True)
    
    return vector


In [13]:
def normalize_vector(vector):
    normalized_vector = vector / np.linalg.norm(vector)
    
    return normalized_vector

In [18]:
text =doc['tags'] 
title_vec = vectorize_text(text)
normalized_vector_title = normalize_vector(title_vec)
print(normalized_vector_title)

AttributeError: 'list' object has no attribute 'split'

In [16]:
text =doc['title'] 
title_vec = vectorize_text(text)
normalized_vector_title = normalize_vector(title_vec)
print(normalized_vector_title)

tensor([-7.5098e-02, -3.2001e-02, -3.4372e-02, -5.4321e-02,  9.1698e-03,
        -4.1828e-02,  1.0451e-01,  9.1807e-03,  1.7489e-02, -6.0192e-02,
         7.1247e-02, -2.0161e-02,  3.4009e-02, -2.0237e-02,  3.4822e-02,
        -2.5172e-02, -5.6595e-03,  2.7537e-02, -1.0023e-01,  6.7300e-03,
        -1.4080e-01,  2.0834e-03, -6.5475e-02,  5.3364e-02, -5.4745e-02,
         4.1959e-02,  7.7322e-03,  5.5533e-02, -5.2701e-02, -4.4167e-02,
         5.1935e-03,  3.2325e-02,  1.1942e-01,  6.2033e-03,  4.2735e-02,
        -2.7671e-02, -6.3851e-02, -2.6104e-02, -8.2532e-02, -1.0414e-02,
        -5.9735e-02, -7.5617e-02, -2.5325e-02,  2.3954e-02,  4.5534e-02,
         2.6585e-02, -6.1936e-03,  3.4197e-02,  8.6103e-02,  5.2832e-02,
        -6.5413e-02, -8.9018e-03, -4.5874e-02,  3.3694e-02, -4.0460e-02,
         3.3844e-03, -2.2014e-02, -3.4824e-03,  2.9436e-02,  2.8088e-02,
        -9.4455e-03, -4.2134e-02,  6.6283e-03,  6.0601e-02, -3.4527e-02,
        -4.5290e-02, -1.2326e-01, -4.6454e-02, -3.2

In [17]:
text =doc['description']
des_vec = vectorize_text(text)
normalized_vector_des = normalize_vector(des_vec)
print(normalized_vector_des)

tensor([-2.6456e-02, -3.0525e-03, -4.9228e-03, -3.9669e-02, -1.7772e-02,
        -5.4842e-03,  2.6563e-02, -4.5115e-02, -6.9871e-02, -2.5495e-02,
         7.1775e-02,  6.1221e-02,  1.4985e-02, -6.0846e-02,  6.2930e-02,
         8.1881e-03,  8.8664e-02,  3.0924e-03, -2.5710e-02, -2.3845e-02,
        -1.1893e-01, -2.1062e-02, -1.2346e-01,  3.8047e-02,  2.7591e-03,
         4.7724e-02,  5.5189e-02,  3.8505e-04,  1.4941e-02, -3.5568e-02,
         3.1782e-02,  8.8407e-02,  7.4283e-02,  4.9695e-02, -6.9738e-02,
        -7.4827e-03, -3.1138e-02,  6.2846e-02, -1.1515e-02, -8.5587e-04,
        -2.6534e-02,  1.7358e-02, -9.8629e-02,  3.4224e-02,  2.7103e-02,
        -3.6325e-02,  7.7362e-03, -9.8653e-03,  7.5621e-03,  8.8073e-02,
        -1.9161e-01, -8.0606e-05,  3.5080e-03,  2.6441e-02,  8.1978e-04,
         1.2277e-02, -3.6256e-02, -2.7674e-02,  8.1362e-03,  5.9868e-02,
        -2.2137e-02, -6.2050e-02, -5.7038e-02,  8.3038e-02,  5.3858e-03,
         2.3126e-02, -3.3060e-02,  1.8831e-02, -2.1

In [9]:
health_vec = {
    "title_vec": normalized_vector_title.tolist(),
    "description_vec": normalized_vector_des.tolist()
}

In [10]:
combined_heatlth = doc

In [12]:
combined_heatlth.update(health_vec)

In [13]:
with open('combined_health.json', 'w') as output_file:
    json.dump(combined_heatlth, output_file, indent=4)