In [19]:
import os
import json

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'

In [20]:
from google.cloud import language_v2

# Initialize the Gemini API client
client = language_v2.LanguageServiceClient()

In [21]:
def get_sentence_embeddings(client, text):
    """
    Uses Gemini API to generate semantic insights (e.g., syntax tokens) for the text.
    """
    document = language_v2.Document(content=text, type_=language_v2.Document.Type.PLAIN_TEXT)
    response = client.analyze_syntax(document=document)
    
    # Extract sentences and their lengths as a proxy for semantic features
    sentences = [sentence.text.content for sentence in response.sentences]
    embeddings = [len(sentence) for sentence in sentences]  # Example: Use length as a basic embedding
    
    return sentences, embeddings


In [22]:
def semantic_chunking_with_gemini(client, text, threshold=50):
    """
    Splits text into semantic chunks based on the Gemini API's syntax analysis.
    """
    sentences, embeddings = get_sentence_embeddings(client, text)
    chunks = []
    current_chunk = []
    
    for i in range(len(sentences) - 1):
        current_chunk.append(sentences[i])
        
        # Calculate difference (e.g., absolute difference in sentence lengths as a proxy)
        difference = abs(embeddings[i] - embeddings[i + 1])
        
        # Split when the difference exceeds the threshold
        if difference > threshold:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
    
    # Append the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks


In [23]:
# Open the file in read mode and load its content into a variable
with open('combined_nutrition_papers.txt', 'r') as file:
    combined_nutrition_papers = file.read()

# Print the content
text = combined_nutrition_papers


# Perform semantic chunking
print("#### Semantic Chunking ####")
client = language_v2.LanguageServiceClient()
chunks = semantic_chunking_with_gemini(client, text, threshold=50)

# Print the resulting chunks
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}:\n{chunk}\n")


#### Semantic Chunking ####


AttributeError: 'LanguageServiceClient' object has no attribute 'analyze_syntax'

In [25]:
from google.cloud import language_v1 as language_v2

def chunk_text(text, max_chunk_size=500):
  """
  Chunks a large text into smaller pieces suitable for Cloud Natural Language API.

  Args:
    text: The text to be chunked.
    max_chunk_size: The maximum size (in words) of each chunk. Defaults to 500.

  Returns:
    A list of text chunks, each not exceeding the maximum size.
  """

  chunks = []
  words = text.split()
  current_chunk = []
  for word in words:
    current_chunk.append(word)
    if len(current_chunk) >= max_chunk_size:
      chunks.append(" ".join(current_chunk))
      current_chunk = []
  # Add the last remaining chunk, if any
  if current_chunk:
    chunks.append(" ".join(current_chunk))

  return chunks

def get_sentence_embeddings(client, text):
    """
    Uses Gemini API to generate semantic insights (e.g., syntax tokens) for the text.
    """
    document = language_v2.Document(content=text, type_=language_v2.Document.Type.PLAIN_TEXT)
    response = client.analyze_syntax(document=document)
    
    # Extract sentences and their lengths as a proxy for semantic features
    sentences = [sentence.text.content for sentence in response.sentences]
    embeddings = [len(sentence) for sentence in sentences]  # Example: Use length as a basic embedding
    
    return sentences, embeddings

def semantic_chunking_with_gemini(client, text, threshold=50):
    """
    Splits text into semantic chunks based on the Gemini API's syntax analysis.
    """
    chunks = []
    # Chunk the text into smaller pieces suitable for the API
    text_chunks = chunk_text(text)
    for chunk in text_chunks:
        sentences, embeddings = get_sentence_embeddings(client, chunk)
        current_chunk = []
        for i in range(len(sentences) - 1):
            current_chunk.append(sentences[i])
            difference = abs(embeddings[i] - embeddings[i + 1])
            if difference > threshold:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
        if current_chunk:
            chunks.append(" ".join(current_chunk))
    return chunks

# Open the file in read mode and load its content into a variable
with open('combined_nutrition_papers.txt', 'r') as file:
    combined_nutrition_papers = file.read()

# Print the content
text = combined_nutrition_papers


# Perform semantic chunking with chunking
print("#### Semantic Chunking with Chunking ####")
client = language_v2.LanguageServiceClient()
chunks = semantic_chunking_with_gemini(client, text, threshold=50)

# Print the resulting chunks
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}:\n{chunk}\n")

#### Semantic Chunking with Chunking ####
Chunk 1:
article https://doi.org/10.1038/s41467-023-41969-1 the personalized nutrition study (points): evaluation of a genetically informed weightloss approach, a randomized clinical trial christoph höchsmann1,2, shengping yang2,j o s ém .o r d o v á s3, james l. dorling4, catherine m. champagne2, john w. apolzan2, frank l. greenway2,m i c h e l l ei .c a r d e l5,6,g a r yd .f o s t e r5,7& corby k. martin2 weight loss (wl) differences between iso caloric high-carbohydrate and high-fat diets are generally small; however, indi vidual wl varies within diet groups.

Chunk 2:
genotype patterns may modify diet effect s, with carbohydrate-responsive gen- otypes losing more weight on high-carbohydrate diets (and vice versa for fat-responsive genotypes). we investigate d whether 12-week wl (kg, primary out- come) differs between genotype-concor dant and genotype-discordant diets.

Chunk 3:
in this 12-week single-center wl trial, 14 5 participants with

In [32]:
import logging
import numpy as np  # For percentile calculation
from google.cloud import language_v1
from sklearn.metrics.pairwise import cosine_similarity  # Ensure sklearn is installed

# Set up logging
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")

def semantic_chunking(text, percentile_threshold=90):
    """
    Performs semantic chunking on the input text using Gemini embeddings.

    Args:
        text: The input text to be chunked.
        percentile_threshold: The percentile threshold for determining breakpoints.

    Returns:
        A list of text chunks.
    """

    if not text.strip():
        logging.error("Input text is empty or only contains whitespace.")
        raise ValueError("Input text cannot be empty.")
    
    logging.info("Initializing Google Cloud Language API client.")
    client = language_v1.LanguageServiceClient()

    # 1. Get Gemini embeddings for each sentence
    logging.info("Splitting input text into sentences.")
    sentences = [sentence.strip() for sentence in text.split(".") if sentence.strip()]
    logging.debug(f"Extracted {len(sentences)} sentences from the input text.")

    embeddings = []
    for sentence in sentences:
        try:
            logging.debug(f"Processing sentence: '{sentence}'")
            document = language_v1.Document(
                content=sentence, type_=language_v1.Document.Type.PLAIN_TEXT
            )
            response = client.analyze_sentiment(request={"document": document})
            embeddings.append([response.document_sentiment.magnitude])
            logging.debug(f"Obtained embedding: {response.document_sentiment.magnitude}")
        except Exception as e:
            logging.error(f"Error analyzing sentence: '{sentence}'. Error: {e}")
            raise

    if len(embeddings) < 2:
        logging.warning("Not enough sentences for similarity calculations. Returning original text as a single chunk.")
        return [text]

    # 2. Calculate sentence similarities (using cosine similarity)
    logging.info("Calculating sentence similarities using cosine similarity.")
    similarities = []
    for i in range(len(sentences) - 1):
        similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0, 0]
        similarities.append(similarity)
        logging.debug(f"Similarity between sentence {i} and {i+1}: {similarity}")

    # 3. Determine breakpoint threshold
    logging.info("Calculating breakpoint threshold using percentile.")
    threshold = np.percentile(similarities, percentile_threshold)
    logging.debug(f"Similarity threshold set at {threshold} (percentile: {percentile_threshold})")

    # 4. Create chunks
    logging.info("Creating chunks based on similarity threshold.")
    chunks = []
    current_chunk = ""
    for i, sentence in enumerate(sentences):
        if i == 0:
            current_chunk += sentence + ". "
        elif similarities[i - 1] >= threshold:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            logging.debug(f"New chunk created: '{current_chunk.strip()}'")
            current_chunk = sentence + ". "
    chunks.append(current_chunk.strip())
    logging.debug(f"Final chunk created: '{current_chunk.strip()}'")

    logging.info(f"Chunking completed. Total chunks created: {len(chunks)}")
    return chunks


In [44]:
import logging
import numpy as np  # For percentile calculation
from google.cloud import language_v1
from sklearn.metrics.pairwise import cosine_similarity  # Ensure sklearn is installed
import time  # For implementing timeouts

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def semantic_chunking(text, percentile_threshold=90):
    """
    Performs semantic chunking on the input text using Gemini embeddings.

    Args:
        text: The input text to be chunked.
        percentile_threshold: The percentile threshold for determining breakpoints.

    Returns:
        A list of text chunks.
    """

    if not text.strip():
        logging.error("Input text is empty or only contains whitespace.")
        raise ValueError("Input text cannot be empty.")
    
    logging.info("Initializing Google Cloud Language API client.")
    client = language_v1.LanguageServiceClient()

    # 1. Get Gemini embeddings for each sentence
    sentences = [sentence.strip() for sentence in text.split(".") if sentence.strip()]

    embeddings = []
    for i, sentence in enumerate(sentences):
        try:
            document = language_v1.Document(
                content=sentence, type_=language_v1.Document.Type.PLAIN_TEXT
            )
            response = client.analyze_sentiment(request={"document": document})
            embeddings.append([response.document_sentiment.magnitude])

            # Add a timeout every 100 requests to manage quota limits
            if (i + 1) % 1000 == 0:
                logging.info("Reached 100 requests. Sleeping for 60 seconds to manage quota.")
                logging.info(f"Total sentences processed: {i+1}")
                time.sleep(10)

        except Exception as e:
            logging.error(f"Error analyzing sentence {i+1}: '{sentence}'. Error: {e}")
            continue
            raise

    if len(embeddings) < 2:
        logging.warning("Not enough sentences for similarity calculations. Returning original text as a single chunk.")
        return [text]

    # 2. Calculate sentence similarities (using cosine similarity)
    similarities = [
        cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0, 0]
        for i in range(len(sentences) - 1)
    ]

    # 3. Determine breakpoint threshold
    threshold = np.percentile(similarities, percentile_threshold)

    # 4. Create chunks
    chunks = []
    current_chunk = ""
    for i, sentence in enumerate(sentences):
        if i == 0:
            current_chunk += sentence + ". "
        elif similarities[i - 1] >= threshold:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    chunks.append(current_chunk.strip())

    logging.info(f"Chunking completed. Total chunks created: {len(chunks)}")
    return chunks


In [45]:
with open('combined_nutrition_papers.txt', 'r') as file:
    combined_nutrition_papers = file.read()

# Print the content
text = combined_nutrition_papers

In [46]:
chunks = semantic_chunking(text)

2025-01-17 16:07:11,736 - INFO - Initializing Google Cloud Language API client.
2025-01-17 16:08:59,900 - INFO - Reached 100 requests. Sleeping for 60 seconds to manage quota.
2025-01-17 16:08:59,901 - INFO - Total sentences processed: 1000
2025-01-17 16:10:23,418 - ERROR - Error analyzing sentence 1763: 'the number of stars for a specif ic food group indicates the level of compliance with the dutch dietary guidelines'. Error: 429 Quota exceeded for quota metric 'Requests' and limit 'Requests per minute' of service 'language.googleapis.com' for consumer 'project_number:543798683069'. [reason: "RATE_LIMIT_EXCEEDED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "language.googleapis.com"
}
metadata {
  key: "quota_metric"
  value: "language.googleapis.com/default_requests"
}
metadata {
  key: "quota_location"
  value: "global"
}
metadata {
  key: "quota_limit"
  value: "DefaultRequestsPerMinutePerProject"
}
metadata {
  key: "quota_limit_value"
  value: "600"
}
metadata {


KeyboardInterrupt: 

In [None]:
import logging
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow_hub as hub

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def semantic_chunking_with_embeddings(text, percentile_threshold=90):
    """
    Perform semantic chunking using sentence embeddings.

    Args:
        text: Input text to be chunked.
        percentile_threshold: Threshold for similarity percentile to determine chunk boundaries.

    Returns:
        List of text chunks.
    """
    if not text.strip():
        logging.error("Input text is empty or only contains whitespace.")
        raise ValueError("Input text cannot be empty.")
    
    logging.info("Loading Universal Sentence Encoder model...")
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    
    # Split text into sentences
    sentences = [sentence.strip() for sentence in text.split(".") if sentence.strip()]
    logging.info(f"Total sentences to process: {len(sentences)}")
    
    # Generate embeddings
    logging.info("Generating sentence embeddings...")
    embeddings = embed(sentences).numpy()
    
    # Calculate cosine similarity between consecutive sentence embeddings
    logging.info("Calculating sentence similarities...")
    similarities = [
        cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0, 0]
        for i in range(len(sentences) - 1)
    ]
    
    # Determine similarity threshold for chunking
    threshold = np.percentile(similarities, percentile_threshold)
    logging.info(f"Similarity threshold for chunking: {threshold:.2f}")
    
    # Create chunks based on similarity
    logging.info("Creating semantic chunks...")
    chunks = []
    current_chunk = ""
    
    for i, sentence in enumerate(sentences):
        if i == 0 or similarities[i - 1] >= threshold:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    chunks.append(current_chunk.strip())  # Add the last chunk
    
    logging.info(f"Chunking completed. Total chunks created: {len(chunks)}")
    return chunks

# Load the text file
with open('combined_nutrition_papers.txt', 'r') as file:
    combined_nutrition_papers = file.read()

# Perform semantic chunking
chunks = semantic_chunking_with_embeddings(combined_nutrition_papers)

# Save the chunks to a file
with open("chunked_output.txt", "w") as output_file:
    for i, chunk in enumerate(chunks, 1):
        output_file.write(f"Chunk {i}:\n{chunk}\n\n")
logging.info("Chunked output saved to 'chunked_output.txt'.")
