In [1]:
from google.cloud import aiplatform
import os

gemini = os.getenv("GEMINI_API_KEY")
location = os.getenv("location")
location_processor = os.getenv("location_processor")
project_id = os.getenv("project_id")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'

aiplatform.init(project=project_id, location=location)

In [6]:
import json
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# Function to embed text using Vertex AI's Text Embedding Model
def embed_text_vertex(data):
    """
    Embeds text using Vertex AI's Text Embedding Model and updates the embeddings in the input data.
    Args:
        data (list): List of dictionaries containing 'text' and 'embedding'.
    Returns:
        list: Updated data with embeddings.
    """
    print("Starting embedding process...")

    # Initialize the embedding model
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")
    print("Embedding model initialized.")

    for entry in data:
        print(entry)
        text = entry['text']
        print(f"Embedding text for ID {entry['id']}...")

        try:
            # Generate embedding
            input_obj = TextEmbeddingInput(text=text)
            response = model.get_embeddings([input_obj])
            embedding = response[0].values

            # Update the entry with the embedding
            entry["embedding"] = embedding
            print(f"Successfully embedded text for ID {entry['id']}")
        except Exception as e:
            print(f"Error embedding text for ID {entry['id']}: {e}")
            entry["embedding"] = None

    print("Embedding process completed.")
    return data

# Main function
if __name__ == "__main__":
    # Path to the JSON file containing data
    input_file_path = "formatted_embeddings_input_nutrition.json"
    output_file_path = "embedded_results_nutrition.json"

    print(f"Loading data from {input_file_path}...")

    # Load the JSON file
    with open(input_file_path, "r") as input_file:
        data = json.load(input_file)
    
    print(f"Data loaded. Total entries: {len(data)}")

    # Embed the text data and update embeddings
    print("Embedding text data...")
    updated_data = embed_text_vertex(data)

    # Save the updated data with embeddings to a new JSON file
    with open(output_file_path, "w") as output_file:
        json.dump(updated_data, output_file, indent=4)

    print(f"Embedding process completed. Results saved to {output_file_path}.")


Loading data from formatted_embeddings_input_nutrition.json...
Data loaded. Total entries: 4154
Embedding text data...
Starting embedding process...
Embedding model initialized.
{'id': 1, 'text': 'Weight loss differences between isocaloric high-carbohydrate and high-fat diets are generally small, but individual weight loss varies substantially within diet groups.', 'embedding': None}
Embedding text for ID 1...
Successfully embedded text for ID 1
{'id': 2, 'text': 'Genotype patterns may modify diet effects, with carbohydrate-responsive genotypes potentially losing more weight on high-carbohydrate diets and fat-responsive genotypes potentially losing more weight on high-fat diets.', 'embedding': None}
Embedding text for ID 2...
Successfully embedded text for ID 2
{'id': 3, 'text': 'A study involving 145 participants with overweight/obesity, identified as fat-responders or carbohydrate-responders based on their combined genotypes at ten genetic variants, found no significant difference in

Old Method:

In [None]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# Function to clean data
def clean_data(data):
    """
    Cleans the text data by removing duplicates, very basic text, and normalizing content.
    Args:
        data (list): List of text rows.
    Returns:
        list: Cleaned text rows.
    """
    print("Starting data cleaning...")
    
    # Convert list to DataFrame
    df = pd.DataFrame(data, columns=["text"])
    print(f"Initial data loaded. Total entries: {len(df)}")
    
    # Remove duplicates
    df = df.drop_duplicates(subset=["text"])
    print(f"Removed duplicates. Remaining entries: {len(df)}")
    
    # Remove short or basic text (less than 5 words or very generic)
    df = df[df["text"].apply(lambda x: len(x.split()) > 5)]
    print(f"Removed short/basic text. Remaining entries: {len(df)}")
    
    # Normalize text: remove special characters, extra spaces
    df["text"] = df["text"].str.replace(r"[^\w\s]", "", regex=True).str.strip()
    print("Normalized text.")
    
    # Optional: Remove highly similar rows using cosine similarity
    print("Calculating cosine similarity for identifying similar rows...")
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df["text"])
    similarity_matrix = cosine_similarity(tfidf_matrix)
    
    # Identify rows with high similarity
    to_remove = set()
    for i in range(len(similarity_matrix)):
        for j in range(i + 1, len(similarity_matrix)):
            if similarity_matrix[i, j] > 0.9:  # Threshold for similarity
                to_remove.add(j)
    
    print(f"Identified {len(to_remove)} similar rows to remove.")
    
    # Keep only unique rows
    df = df.iloc[list(set(range(len(df))) - to_remove)]
    print(f"Final cleaned data. Remaining entries: {len(df)}")
    
    return df["text"].tolist()

# Function to embed text using Vertex AI's Text Embedding Model
def embed_text_vertex(cleaned_text):
    """
    Embeds text using Vertex AI's Text Embedding Model.
    Args:
        cleaned_text (list): List of cleaned text rows.
    Returns:
        pd.DataFrame: DataFrame with original text and embeddings.
    """
    print("Starting embedding process...")

    # Initialize the embedding model
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")
    print("Embedding model initialized.")
    
    embeddings = []
    for idx, text in enumerate(cleaned_text):
        print(f"Embedding text {idx + 1}/{len(cleaned_text)}...")
        input_obj = TextEmbeddingInput(text=text)
        try:
            response = model.get_embeddings([input_obj])
            embedding = response[0].values
            embeddings.append(embedding)
            print(f"Successfully embedded text {idx + 1}")
        except Exception as e:
            print(f"Error embedding text {idx + 1}: {e}")
            embeddings.append(None)

    print("Embedding process completed.")
    
    # Create DataFrame with embeddings
    df_embeddings = pd.DataFrame({
        "text": cleaned_text,
        "embedding": embeddings
    })

    return df_embeddings

# Main function
if __name__ == "__main__":
    # Path to the JSON file created by the collection script
    input_file_path = "aggregated_results_resistant.json"
    print(f"Loading data from {input_file_path}...")

    # Load the JSON file
    with open(input_file_path, "r") as input_file:
        aggregated_results = json.load(input_file)
    
    print(f"Data loaded. Total findings: {len(aggregated_results['findings'])}")
    
    # Clean the findings
    print("Cleaning text data...")
    cleaned_findings = clean_data(aggregated_results["findings"])
    print(f"Cleaned findings: {len(cleaned_findings)} entries remain.")

    # Embed the cleaned text
    print("Embedding text data...")
    embedded_results = embed_text_vertex(cleaned_findings)
    
    # Save the embeddings to a new JSON file
    output_file_path = "embedded_results_nutrition.json"
    embedded_results.to_json(output_file_path, orient="records", indent=4)
    
    print(f"Embedding process completed. Results saved to {output_file_path}.")