In [None]:
import json
import os
import time
import traceback
from pinecone import Pinecone
# Note: We removed the specific import for ApiException as it was causing errors.
# We will rely on the general Exception catch for Pinecone API errors.

# --- Pinecone Configuration ---
# Credentials are taken from the environment so they do not have to be stored
# in the notebook. The empty-string fallback keeps the sanity checks below in
# charge of reporting a missing value.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_INDEX_HOST = os.environ.get("PINECONE_INDEX_HOST", "")
PINECONE_NAMESPACE = "astrollava-embeddings" # Optional: Specify a namespace within your index

# --- Input File Configuration ---
# Adjust the filename to match the JSONL file you created
INPUT_JSONL_FILE = "combined_astrollava_embeddings.jsonl" # <<< MAKE SURE THIS FILENAME IS CORRECT

# --- Upsert Configuration ---
UPSERT_BATCH_SIZE = 100 # Pinecone recommends batch sizes up to 100 for upserts

# --- Helper Function to Validate Record ---
def validate_record(record_dict, line_num):
    """Validate one decoded JSONL record and extract what an upsert needs.

    Args:
        record_dict: Object decoded from one JSONL line. It must be a dict
            holding an "embedding" list of numbers and a "metadata" dict.
        line_num: Line number of the record, used only in warning messages.

    Returns:
        ``(vector_id, embedding, metadata)`` when the record is usable, or
        ``(None, None, None)`` after printing a warning when it is not.
    """
    invalid = (None, None, None)

    if not isinstance(record_dict, dict):
        print(f"Warning: Line {line_num}: Expected a dictionary, got {type(record_dict)}. Skipping.")
        return invalid

    embedding = record_dict.get("embedding")
    metadata = record_dict.get("metadata")

    if not isinstance(embedding, list):
        print(f"Warning: Line {line_num}: 'embedding' field is missing or not a list. Skipping.")
        return invalid

    # Reject vectors that cannot be valid before they reach a batch: the caller
    # drops an entire batch when its upsert raises, so one bad record would
    # take its neighbours down with it. bool is excluded explicitly because it
    # is a subclass of int.
    if not embedding or any(
        isinstance(value, bool) or not isinstance(value, (int, float))
        for value in embedding
    ):
        print(f"Warning: Line {line_num}: 'embedding' is empty or contains non-numeric values. Skipping.")
        return invalid

    if not isinstance(metadata, dict):
        print(f"Warning: Line {line_num}: 'metadata' field is missing or not a dictionary. Skipping.")
        return invalid

    # The ID lives *inside* the metadata dictionary.
    vector_id = metadata.get("id")
    if not isinstance(vector_id, str) or not vector_id.strip():
        # Fallback: derive the ID from 'original_row_index'. A bool must not
        # qualify, otherwise JSON `true` would yield the ID "row_True".
        original_index = metadata.get("original_row_index")
        if isinstance(original_index, int) and not isinstance(original_index, bool):
            vector_id = f"row_{original_index}"
        else:
            print(f"Warning: Line {line_num}: Cannot determine a valid string ID from metadata ('id' or 'original_row_index'). Skipping.")
            return invalid

    return vector_id, embedding, metadata


# --- Main Upsert Logic ---
print("--- Starting Pinecone Upsert Process ---")

# --- Sanity Checks ---
# Each failed check aborts by raising SystemExit(1). The `exit()` helper used
# previously comes from the `site` module for interactive sessions and, called
# without an argument, reports a success status even on these error paths.
if PINECONE_API_KEY == "YOUR_API_KEY" or not PINECONE_API_KEY:
    print("Error: Pinecone API key not set. Please set the PINECONE_API_KEY environment variable or replace 'YOUR_API_KEY' in the script.")
    raise SystemExit(1)

if PINECONE_INDEX_HOST == "YOUR_INDEX_HOST" or not PINECONE_INDEX_HOST:
    print("Error: Pinecone index host not set. Please set the PINECONE_INDEX_HOST environment variable or replace 'YOUR_INDEX_HOST' in the script.")
    print("See: https://docs.pinecone.io/guides/data/target-an-index")
    raise SystemExit(1)

if not os.path.exists(INPUT_JSONL_FILE):
    print(f"Error: Input file not found: '{INPUT_JSONL_FILE}'")
    raise SystemExit(1)

# --- Initialize Pinecone Connection ---
# Creates the client, targets the index by host and prints the index stats as
# a connectivity check. Any failure is reported with a traceback and aborts the
# run, because everything below needs the `index` handle.
try:
    print("Initializing Pinecone connection...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    # Target the specific index host
    index = pc.Index(host=PINECONE_INDEX_HOST)
    print(f"Connected to Pinecone index host: {PINECONE_INDEX_HOST}")
    # Print index stats to confirm the connection and show the initial state
    print("Initial index stats:", index.describe_index_stats())

except Exception as e:
    print(f"Error initializing Pinecone or connecting to index: {e}")
    traceback.print_exc() # Print full traceback for initialization errors
    # SystemExit(1) instead of the interactive `exit()` helper, so the failure
    # is reflected in the exit status.
    raise SystemExit(1) from e

# --- Read JSONL and Upsert in Batches ---
# Streams the JSONL file line by line, validates each record and sends the
# accepted vectors to Pinecone in groups of UPSERT_BATCH_SIZE. A failed upsert
# drops that batch (counted in total_skipped) and processing continues.
batch_to_upsert = []
batch_first_line = None  # File line of the first vector in the pending batch
total_lines_read = 0
total_vectors_upserted = 0
total_skipped = 0

print(f"Reading from '{INPUT_JSONL_FILE}' and upserting to namespace '{PINECONE_NAMESPACE}'...")

try:
    with open(INPUT_JSONL_FILE, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            total_lines_read += 1
            line = line.strip()
            if not line:
                continue # Skip empty lines

            # Parse JSON line
            try:
                record_data = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Line {line_num}: Invalid JSON. Skipping line: {line[:100]}...")
                total_skipped += 1
                continue

            # Validate and extract data
            vector_id, vector_values, vector_metadata = validate_record(record_data, line_num)

            if vector_id is None: # Skip if validation failed
                total_skipped += 1
                continue

            # Remember where this batch starts. Blank and rejected lines make
            # "line_num - batch size" an unreliable estimate of that position.
            if not batch_to_upsert:
                batch_first_line = line_num

            # --- Prepare vector for Pinecone ---
            vector_obj = {
                "id": vector_id,
                "values": vector_values,
                "metadata": vector_metadata # Pass the whole metadata dict from JSONL
            }
            batch_to_upsert.append(vector_obj)

            # --- Upsert when batch is full ---
            if len(batch_to_upsert) < UPSERT_BATCH_SIZE:
                continue

            print(f"Upserting batch of {len(batch_to_upsert)} vectors (Total lines processed: {total_lines_read})...")
            try:
                upsert_response = index.upsert(vectors=batch_to_upsert, namespace=PINECONE_NAMESPACE)
                # Default to the batch size if the response has no upserted_count
                upserted_count = getattr(upsert_response, 'upserted_count', len(batch_to_upsert))
                total_vectors_upserted += upserted_count
                print(f"  Successfully upserted batch (reported count: {upserted_count}).")
            # Catch ALL exceptions during upsert, print type and message
            except Exception as e:
                print(f"Error during Pinecone upsert batch (lines {batch_first_line} to {line_num}):")
                print(f"  Error Type: {type(e).__name__}") # Print the actual exception type
                print(f"  Error Details: {e}")
                print("  Skipping this batch and continuing...")
                total_skipped += len(batch_to_upsert) # Count skipped items from the failed batch
            finally:
                batch_to_upsert = [] # Clear batch regardless of success or failure

except FileNotFoundError:
    print(f"Error: Input file not found at path: {INPUT_JSONL_FILE}")
except IOError as e:
    print(f"Error reading file {INPUT_JSONL_FILE}: {e}")
except Exception as e:
    print(f"An unexpected error occurred during file processing: {e}")
    traceback.print_exc() # Print traceback for unexpected file errors

# --- Flush the vectors left over after the last full batch ---
# Same handling as a full batch: add the reported count on success, count the
# whole batch as skipped on any error, and empty the batch either way.
if batch_to_upsert:
    print(f"Upserting final batch of {len(batch_to_upsert)} vectors...")
    try:
        upsert_response = index.upsert(
            vectors=batch_to_upsert,
            namespace=PINECONE_NAMESPACE,
        )
        # The batch size stands in when the response carries no upserted_count.
        upserted_count = getattr(upsert_response, "upserted_count", len(batch_to_upsert))
        total_vectors_upserted = total_vectors_upserted + upserted_count
        print(f"  Successfully upserted final batch (reported count: {upserted_count}).")
    except Exception as exc:
        print("Error during final Pinecone upsert batch:")
        print(f"  Error Type: {type(exc).__name__}")
        print(f"  Error Details: {exc}")
        total_skipped = total_skipped + len(batch_to_upsert)
    finally:
        batch_to_upsert = []

# --- Final Summary ---
# One print call with a newline separator emits the same lines as printing
# each summary row separately.
print(
    "\n--- Pinecone Upsert Summary ---",
    f"Input File:          '{INPUT_JSONL_FILE}'",
    f"Target Namespace:    '{PINECONE_NAMESPACE}'",
    f"Total Lines Read:    {total_lines_read}",
    f"Total Vectors Upserted: {total_vectors_upserted}",  # Reported counts or batch sizes
    f"Total Records Skipped: {total_skipped}",  # Validation or upsert errors
    "-------------------------------",
    sep="\n",
)

# Cross-check against the index itself; a failure here is only reported.
try:
    print("Fetching final index stats...")
    final_stats = index.describe_index_stats()
    print("Final index stats:", final_stats)
    # Look the namespace up only when the stats list any namespaces at all.
    namespace_stats = (
        final_stats.namespaces.get(PINECONE_NAMESPACE)
        if final_stats.namespaces
        else None
    )
    if not namespace_stats:
        print(f"Namespace '{PINECONE_NAMESPACE}' not found in index stats or no namespaces present.")
    else:
        print(f"Vector count in namespace '{PINECONE_NAMESPACE}': {namespace_stats.vector_count}")
except Exception as exc:
    print(f"Could not fetch final index stats: {exc}")

print("Script finished.")

--- Starting Pinecone Upsert Process ---
Initializing Pinecone connection...
Connected to Pinecone index host: https://text-collection-4arfk3f.svc.aped-4627-b74a.pinecone.io
Initial index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}
Reading from 'combined_astrollava_embeddings.jsonl' and upserting to namespace 'astrollava-embeddings'...
Upserting batch of 100 vectors (Total lines processed: 100)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 200)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 300)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 400)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 500)...
  Successfully upserted batch (report

In [None]:
import json
import os
import time
import traceback
from pinecone import Pinecone
# Note: We removed the specific import for ApiException as it was causing errors.
# We will rely on the general Exception catch for Pinecone API errors.

# --- Pinecone Configuration ---
# Credentials are taken from the environment so they do not have to be stored
# in the notebook. The empty-string fallback keeps the sanity checks below in
# charge of reporting a missing value.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_INDEX_HOST = os.environ.get("PINECONE_INDEX_HOST", "")
PINECONE_NAMESPACE = "astrollava-embeddings" # Optional: Specify a namespace within your index

# --- Input File Configuration ---
# Adjust the filename to match the JSONL file you created
INPUT_JSONL_FILE = "hubble_embeddings_rows_0_to_2705_new.jsonl" # <<< MAKE SURE THIS FILENAME IS CORRECT

# --- Upsert Configuration ---
UPSERT_BATCH_SIZE = 100 # Pinecone recommends batch sizes up to 100 for upserts

# --- Helper Function to Validate Record ---
def validate_record(record_dict, line_num):
    """Validate one decoded JSONL record and extract what an upsert needs.

    Args:
        record_dict: Object decoded from one JSONL line. It must be a dict
            holding an "embedding" list of numbers and a "metadata" dict.
        line_num: Line number of the record, used only in warning messages.

    Returns:
        ``(vector_id, embedding, metadata)`` when the record is usable, or
        ``(None, None, None)`` after printing a warning when it is not.
    """
    invalid = (None, None, None)

    if not isinstance(record_dict, dict):
        print(f"Warning: Line {line_num}: Expected a dictionary, got {type(record_dict)}. Skipping.")
        return invalid

    embedding = record_dict.get("embedding")
    metadata = record_dict.get("metadata")

    if not isinstance(embedding, list):
        print(f"Warning: Line {line_num}: 'embedding' field is missing or not a list. Skipping.")
        return invalid

    # Reject vectors that cannot be valid before they reach a batch: the caller
    # drops an entire batch when its upsert raises, so one bad record would
    # take its neighbours down with it. bool is excluded explicitly because it
    # is a subclass of int.
    if not embedding or any(
        isinstance(value, bool) or not isinstance(value, (int, float))
        for value in embedding
    ):
        print(f"Warning: Line {line_num}: 'embedding' is empty or contains non-numeric values. Skipping.")
        return invalid

    if not isinstance(metadata, dict):
        print(f"Warning: Line {line_num}: 'metadata' field is missing or not a dictionary. Skipping.")
        return invalid

    # The ID lives *inside* the metadata dictionary.
    vector_id = metadata.get("id")
    if not isinstance(vector_id, str) or not vector_id.strip():
        # Fallback: derive the ID from 'original_row_index'. A bool must not
        # qualify, otherwise JSON `true` would yield the ID "row_True".
        original_index = metadata.get("original_row_index")
        if isinstance(original_index, int) and not isinstance(original_index, bool):
            vector_id = f"row_{original_index}"
        else:
            print(f"Warning: Line {line_num}: Cannot determine a valid string ID from metadata ('id' or 'original_row_index'). Skipping.")
            return invalid

    return vector_id, embedding, metadata


# --- Main Upsert Logic ---
print("--- Starting Pinecone Upsert Process ---")

# --- Sanity Checks ---
# Each failed check aborts by raising SystemExit(1). The `exit()` helper used
# previously comes from the `site` module for interactive sessions and, called
# without an argument, reports a success status even on these error paths.
if PINECONE_API_KEY == "YOUR_API_KEY" or not PINECONE_API_KEY:
    print("Error: Pinecone API key not set. Please set the PINECONE_API_KEY environment variable or replace 'YOUR_API_KEY' in the script.")
    raise SystemExit(1)

if PINECONE_INDEX_HOST == "YOUR_INDEX_HOST" or not PINECONE_INDEX_HOST:
    print("Error: Pinecone index host not set. Please set the PINECONE_INDEX_HOST environment variable or replace 'YOUR_INDEX_HOST' in the script.")
    print("See: https://docs.pinecone.io/guides/data/target-an-index")
    raise SystemExit(1)

if not os.path.exists(INPUT_JSONL_FILE):
    print(f"Error: Input file not found: '{INPUT_JSONL_FILE}'")
    raise SystemExit(1)

# --- Initialize Pinecone Connection ---
# Creates the client, targets the index by host and prints the index stats as
# a connectivity check. Any failure is reported with a traceback and aborts the
# run, because everything below needs the `index` handle.
try:
    print("Initializing Pinecone connection...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    # Target the specific index host
    index = pc.Index(host=PINECONE_INDEX_HOST)
    print(f"Connected to Pinecone index host: {PINECONE_INDEX_HOST}")
    # Print index stats to confirm the connection and show the initial state
    print("Initial index stats:", index.describe_index_stats())

except Exception as e:
    print(f"Error initializing Pinecone or connecting to index: {e}")
    traceback.print_exc() # Print full traceback for initialization errors
    # SystemExit(1) instead of the interactive `exit()` helper, so the failure
    # is reflected in the exit status.
    raise SystemExit(1) from e

# --- Read JSONL and Upsert in Batches ---
# Streams the JSONL file line by line, validates each record and sends the
# accepted vectors to Pinecone in groups of UPSERT_BATCH_SIZE. A failed upsert
# drops that batch (counted in total_skipped) and processing continues.
batch_to_upsert = []
batch_first_line = None  # File line of the first vector in the pending batch
total_lines_read = 0
total_vectors_upserted = 0
total_skipped = 0

print(f"Reading from '{INPUT_JSONL_FILE}' and upserting to namespace '{PINECONE_NAMESPACE}'...")

try:
    with open(INPUT_JSONL_FILE, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            total_lines_read += 1
            line = line.strip()
            if not line:
                continue # Skip empty lines

            # Parse JSON line
            try:
                record_data = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Line {line_num}: Invalid JSON. Skipping line: {line[:100]}...")
                total_skipped += 1
                continue

            # Validate and extract data
            vector_id, vector_values, vector_metadata = validate_record(record_data, line_num)

            if vector_id is None: # Skip if validation failed
                total_skipped += 1
                continue

            # Remember where this batch starts. Blank and rejected lines make
            # "line_num - batch size" an unreliable estimate of that position.
            if not batch_to_upsert:
                batch_first_line = line_num

            # --- Prepare vector for Pinecone ---
            vector_obj = {
                "id": vector_id,
                "values": vector_values,
                "metadata": vector_metadata # Pass the whole metadata dict from JSONL
            }
            batch_to_upsert.append(vector_obj)

            # --- Upsert when batch is full ---
            if len(batch_to_upsert) < UPSERT_BATCH_SIZE:
                continue

            print(f"Upserting batch of {len(batch_to_upsert)} vectors (Total lines processed: {total_lines_read})...")
            try:
                upsert_response = index.upsert(vectors=batch_to_upsert, namespace=PINECONE_NAMESPACE)
                # Default to the batch size if the response has no upserted_count
                upserted_count = getattr(upsert_response, 'upserted_count', len(batch_to_upsert))
                total_vectors_upserted += upserted_count
                print(f"  Successfully upserted batch (reported count: {upserted_count}).")
            # Catch ALL exceptions during upsert, print type and message
            except Exception as e:
                print(f"Error during Pinecone upsert batch (lines {batch_first_line} to {line_num}):")
                print(f"  Error Type: {type(e).__name__}") # Print the actual exception type
                print(f"  Error Details: {e}")
                print("  Skipping this batch and continuing...")
                total_skipped += len(batch_to_upsert) # Count skipped items from the failed batch
            finally:
                batch_to_upsert = [] # Clear batch regardless of success or failure

except FileNotFoundError:
    print(f"Error: Input file not found at path: {INPUT_JSONL_FILE}")
except IOError as e:
    print(f"Error reading file {INPUT_JSONL_FILE}: {e}")
except Exception as e:
    print(f"An unexpected error occurred during file processing: {e}")
    traceback.print_exc() # Print traceback for unexpected file errors

# --- Flush the vectors left over after the last full batch ---
# Same handling as a full batch: add the reported count on success, count the
# whole batch as skipped on any error, and empty the batch either way.
if batch_to_upsert:
    print(f"Upserting final batch of {len(batch_to_upsert)} vectors...")
    try:
        upsert_response = index.upsert(
            vectors=batch_to_upsert,
            namespace=PINECONE_NAMESPACE,
        )
        # The batch size stands in when the response carries no upserted_count.
        upserted_count = getattr(upsert_response, "upserted_count", len(batch_to_upsert))
        total_vectors_upserted = total_vectors_upserted + upserted_count
        print(f"  Successfully upserted final batch (reported count: {upserted_count}).")
    except Exception as exc:
        print("Error during final Pinecone upsert batch:")
        print(f"  Error Type: {type(exc).__name__}")
        print(f"  Error Details: {exc}")
        total_skipped = total_skipped + len(batch_to_upsert)
    finally:
        batch_to_upsert = []

# --- Final Summary ---
# One print call with a newline separator emits the same lines as printing
# each summary row separately.
print(
    "\n--- Pinecone Upsert Summary ---",
    f"Input File:          '{INPUT_JSONL_FILE}'",
    f"Target Namespace:    '{PINECONE_NAMESPACE}'",
    f"Total Lines Read:    {total_lines_read}",
    f"Total Vectors Upserted: {total_vectors_upserted}",  # Reported counts or batch sizes
    f"Total Records Skipped: {total_skipped}",  # Validation or upsert errors
    "-------------------------------",
    sep="\n",
)

# Cross-check against the index itself; a failure here is only reported.
try:
    print("Fetching final index stats...")
    final_stats = index.describe_index_stats()
    print("Final index stats:", final_stats)
    # Look the namespace up only when the stats list any namespaces at all.
    namespace_stats = (
        final_stats.namespaces.get(PINECONE_NAMESPACE)
        if final_stats.namespaces
        else None
    )
    if not namespace_stats:
        print(f"Namespace '{PINECONE_NAMESPACE}' not found in index stats or no namespaces present.")
    else:
        print(f"Vector count in namespace '{PINECONE_NAMESPACE}': {namespace_stats.vector_count}")
except Exception as exc:
    print(f"Could not fetch final index stats: {exc}")

print("Script finished.")

--- Starting Pinecone Upsert Process ---
Initializing Pinecone connection...
Connected to Pinecone index host: https://text-collection-4arfk3f.svc.aped-4627-b74a.pinecone.io
Initial index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'astrollava-embeddings': {'vector_count': 26245}},
 'total_vector_count': 26245,
 'vector_type': 'dense'}
Reading from 'hubble_embeddings_rows_0_to_2705_new.jsonl' and upserting to namespace 'astrollava-embeddings'...
Upserting batch of 100 vectors (Total lines processed: 100)...
Error during Pinecone upsert batch (lines ~1 to 100):
  Error Type: PineconeApiException
  Error Details: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sun, 20 Apr 2025 09:58:08 GMT', 'Content-Type': 'application/json', 'Content-Length': '132', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '1478', 'x-pinecone-request-id': '8040026065756532363', 'x-envoy-upstream-service-time': '22', 'server': 'env

In [None]:
import json
import os
import time
import traceback
from pinecone import Pinecone
# Import exception if needed, otherwise rely on general Exception
try:
    from pinecone.exceptions import ApiException
except ImportError:
    ApiException = Exception # Fallback

# --- Pinecone Configuration ---
# Credentials are taken from the environment so they do not have to be stored
# in the notebook. The empty-string fallback leaves it to the sanity checks
# below to report a missing value.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_INDEX_HOST = os.environ.get("PINECONE_INDEX_HOST", "")
# Make sure this namespace matches where you intend to query later
PINECONE_NAMESPACE = "hubble-embeddings" # <<< Changed namespace to reflect data source

# --- Input File Configuration ---
INPUT_JSONL_FILE = "hubble_embeddings_rows_0_to_2705_new.jsonl" # <<< VERIFY FILENAME

# --- Upsert Configuration ---
UPSERT_BATCH_SIZE = 100

# --- Helper Function to Clean Metadata ---
def clean_metadata(metadata_dict):
    """Return a copy of a metadata dict restricted to simple value types.

    Conversion rules, applied per key:
      * ``None`` becomes an empty string.
      * ``str``, ``int``, ``float`` and ``bool`` values are kept unchanged.
      * Lists are kept as lists of strings: ``None`` items are dropped and
        every other non-string item is converted with ``str()``.
      * Any other type (for example a nested dict) is converted with ``str()``
        and a warning is printed.

    Args:
        metadata_dict: Metadata mapping taken from a JSONL record.

    Returns:
        A new dict. It is empty when ``metadata_dict`` is not a dict.
    """
    if not isinstance(metadata_dict, dict):
        # Missing or invalid metadata is treated as "no metadata".
        return {}

    cleaned_meta = {}
    for key, value in metadata_dict.items():
        if value is None:
            cleaned_meta[key] = ""  # Replace None with empty string
        elif isinstance(value, list):
            # Passing lists through untouched lets None or non-string items
            # reach the API. Pinecone documents list metadata as lists of
            # strings, so normalise the items here.
            cleaned_meta[key] = [
                item if isinstance(item, str) else str(item)
                for item in value
                if item is not None
            ]
        elif isinstance(value, (str, int, float, bool)):
            cleaned_meta[key] = value
        else:
            # Fallback for types such as nested dicts.
            print(f"Warning: Metadata field '{key}' has unsupported type {type(value)}. Converting to string.")
            cleaned_meta[key] = str(value)
    return cleaned_meta

# --- Helper Function to Validate Record ---
def validate_record(record_dict, line_num):
    """Validate one decoded JSONL record and extract what an upsert needs.

    Args:
        record_dict: Object decoded from one JSONL line. It must be a dict
            holding an "embedding" list of numbers and a "metadata" dict.
        line_num: Line number of the record, used only in warning messages.

    Returns:
        ``(vector_id, embedding, metadata)`` when the record is usable, or
        ``(None, None, None)`` after printing a warning when it is not.
    """
    invalid = (None, None, None)

    if not isinstance(record_dict, dict):
        print(f"Warning: Line {line_num}: Expected a dictionary, got {type(record_dict)}. Skipping.")
        return invalid

    embedding = record_dict.get("embedding")
    metadata = record_dict.get("metadata")

    if not isinstance(embedding, list):
        print(f"Warning: Line {line_num}: 'embedding' field is missing or not a list. Skipping.")
        return invalid

    # Reject vectors that cannot be valid before they reach a batch: the caller
    # drops an entire batch when its upsert raises, so one bad record would
    # take its neighbours down with it. bool is excluded explicitly because it
    # is a subclass of int.
    if not embedding or any(
        isinstance(value, bool) or not isinstance(value, (int, float))
        for value in embedding
    ):
        print(f"Warning: Line {line_num}: 'embedding' is empty or contains non-numeric values. Skipping.")
        return invalid

    if not isinstance(metadata, dict):
        print(f"Warning: Line {line_num}: 'metadata' field is missing or not a dictionary. Skipping.")
        return invalid

    vector_id = metadata.get("id")
    if not isinstance(vector_id, str) or not vector_id.strip():
        # Fallback: derive the ID from 'original_row_index'. A bool must not
        # qualify, otherwise JSON `true` would yield the ID "row_True".
        original_index = metadata.get("original_row_index")
        if isinstance(original_index, int) and not isinstance(original_index, bool):
            vector_id = f"row_{original_index}"
        else:
            print(f"Warning: Line {line_num}: Cannot determine valid string ID. Skipping.")
            return invalid

    return vector_id, embedding, metadata


# --- Main Upsert Logic ---
print("--- Starting Pinecone Upsert Process ---")

# --- Sanity Checks ---
# Abort with the error message when a required setting or the input file is
# missing. SystemExit is raised directly instead of going through the `exit()`
# helper, which the `site` module provides for interactive sessions.
if PINECONE_API_KEY == "YOUR_API_KEY" or not PINECONE_API_KEY:
    raise SystemExit("Error: Pinecone API key not set.")
if PINECONE_INDEX_HOST == "YOUR_INDEX_HOST" or not PINECONE_INDEX_HOST:
    raise SystemExit("Error: Pinecone index host not set.")
if not os.path.exists(INPUT_JSONL_FILE):
    raise SystemExit(f"Error: Input file not found: '{INPUT_JSONL_FILE}'")

# --- Initialize Pinecone Connection ---
# Creates the client, targets the index by host and prints the index stats as
# a connectivity check. Any failure is reported with a traceback and aborts the
# run, because everything below needs the `index` handle.
try:
    print("Initializing Pinecone connection...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(host=PINECONE_INDEX_HOST)
    print(f"Connected to Pinecone index host: {PINECONE_INDEX_HOST}")
    print("Initial index stats:", index.describe_index_stats())
except Exception as e:
    print(f"Error initializing Pinecone or connecting to index: {e}")
    traceback.print_exc()
    # SystemExit(1) instead of the interactive `exit()` helper, so the failure
    # is reflected in the exit status.
    raise SystemExit(1) from e

# --- Read JSONL and Upsert in Batches ---
# Streams the JSONL file line by line, validates each record, cleans its
# metadata and sends the vectors to Pinecone in groups of UPSERT_BATCH_SIZE.
# A failed upsert drops that batch (counted in total_skipped) and processing
# continues with the next one.
batch_to_upsert = []
batch_first_line = None  # File line of the first vector in the pending batch
total_lines_read = 0
total_vectors_upserted = 0
total_skipped = 0

print(f"Reading from '{INPUT_JSONL_FILE}' and upserting to namespace '{PINECONE_NAMESPACE}'...")

try:
    with open(INPUT_JSONL_FILE, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            total_lines_read += 1
            line = line.strip()
            if not line:
                continue

            try:
                record_data = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Line {line_num}: Invalid JSON. Skipping.")
                total_skipped += 1
                continue

            vector_id, vector_values, vector_metadata = validate_record(record_data, line_num)
            if vector_id is None:
                total_skipped += 1
                continue

            # Clean the metadata before it is queued for upsert.
            cleaned_vector_metadata = clean_metadata(vector_metadata)

            # Remember where this batch starts. Blank and rejected lines make
            # "line_num - batch size" an unreliable estimate of that position.
            if not batch_to_upsert:
                batch_first_line = line_num

            # --- Prepare vector for Pinecone ---
            vector_obj = {
                "id": vector_id,
                "values": vector_values,
                "metadata": cleaned_vector_metadata
            }
            batch_to_upsert.append(vector_obj)

            # --- Upsert when batch is full ---
            if len(batch_to_upsert) < UPSERT_BATCH_SIZE:
                continue

            print(f"Upserting batch of {len(batch_to_upsert)} vectors (Total lines processed: {total_lines_read})...")
            try:
                upsert_response = index.upsert(vectors=batch_to_upsert, namespace=PINECONE_NAMESPACE)
                # Default to the batch size if the response has no upserted_count
                upserted_count = getattr(upsert_response, 'upserted_count', len(batch_to_upsert))
                total_vectors_upserted += upserted_count
                print(f"  Successfully upserted batch (reported count: {upserted_count}).")
            except Exception as e: # General catch; covers Pinecone API errors too
                print(f"Error during Pinecone upsert batch (lines {batch_first_line} to {line_num}):")
                print(f"  Error Type: {type(e).__name__}")
                print(f"  Error Details: {e}")
                # Point at the cleaning step when the error text mentions metadata values.
                if "Metadata value" in str(e):
                    print("  Potential lingering metadata issue detected. Check cleaning logic.")
                print("  Skipping this batch and continuing...")
                total_skipped += len(batch_to_upsert)
            finally:
                batch_to_upsert = []

except FileNotFoundError:
    print(f"Error: Input file not found: {INPUT_JSONL_FILE}")
except IOError as e:
    print(f"Error reading file {INPUT_JSONL_FILE}: {e}")
except Exception as e:
    print(f"An unexpected error occurred during file processing: {e}")
    traceback.print_exc()

# --- Flush the vectors left over after the last full batch ---
# Same handling as a full batch: add the reported count on success, count the
# whole batch as skipped on any error, and empty the batch either way.
if batch_to_upsert:
    print(f"Upserting final batch of {len(batch_to_upsert)} vectors...")
    try:
        upsert_response = index.upsert(
            vectors=batch_to_upsert,
            namespace=PINECONE_NAMESPACE,
        )
        # The batch size stands in when the response carries no upserted_count.
        upserted_count = getattr(upsert_response, "upserted_count", len(batch_to_upsert))
        total_vectors_upserted = total_vectors_upserted + upserted_count
        print(f"  Successfully upserted final batch (reported count: {upserted_count}).")
    except Exception as exc:
        print("Error during final Pinecone upsert batch:")
        print(f"  Error Type: {type(exc).__name__}")
        print(f"  Error Details: {exc}")
        total_skipped = total_skipped + len(batch_to_upsert)
    finally:
        batch_to_upsert = []

# --- Final Summary ---
# One print call with a newline separator emits the same lines as printing
# each summary row separately.
print(
    "\n--- Pinecone Upsert Summary ---",
    f"Input File:          '{INPUT_JSONL_FILE}'",
    f"Target Namespace:    '{PINECONE_NAMESPACE}'",
    f"Total Lines Read:    {total_lines_read}",
    f"Total Vectors Upserted: {total_vectors_upserted}",
    f"Total Records Skipped: {total_skipped}",
    "-------------------------------",
    sep="\n",
)

# Cross-check against the index itself; a failure here is only reported.
try:
    print("Fetching final index stats...")
    final_stats = index.describe_index_stats()
    print("Final index stats:", final_stats)
    # Look the namespace up only when the stats list any namespaces at all.
    namespace_stats = (
        final_stats.namespaces.get(PINECONE_NAMESPACE)
        if final_stats.namespaces
        else None
    )
    if not namespace_stats:
        print(f"Namespace '{PINECONE_NAMESPACE}' not found in index stats.")
    else:
        print(f"Vector count in namespace '{PINECONE_NAMESPACE}': {namespace_stats.vector_count}")
except Exception as exc:
    print(f"Could not fetch final index stats: {exc}")

print("Script finished.")

--- Starting Pinecone Upsert Process ---
Initializing Pinecone connection...
Connected to Pinecone index host: https://text-collection-4arfk3f.svc.aped-4627-b74a.pinecone.io
Initial index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'astrollava-embeddings': {'vector_count': 26260}},
 'total_vector_count': 26260,
 'vector_type': 'dense'}
Reading from 'hubble_embeddings_rows_0_to_2705_new.jsonl' and upserting to namespace 'hubble-embeddings'...
Upserting batch of 100 vectors (Total lines processed: 100)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 200)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 300)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 400)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines pr

In [6]:
import json
from collections import Counter

INPUT_JSONL_FILE = "hubble_embeddings_rows_0_to_2705_new.jsonl" # Make sure this is correct
# Vector count Pinecone reported for the target namespace after the upsert run;
# the diagnosis below compares the file's unique-ID count against it.
OBSERVED_PINECONE_COUNT = 2592


def resolve_vector_id(metadata):
    """Return the vector ID derived from a record's metadata, or None.

    Uses the same rule as the upsert cell's ``validate_record``: a non-blank
    string ``metadata["id"]`` wins; otherwise an integer
    ``metadata["original_row_index"]`` yields ``"row_<index>"``.
    """
    vector_id = metadata.get("id")
    if isinstance(vector_id, str) and vector_id.strip():
        return vector_id
    original_index = metadata.get("original_row_index")
    if isinstance(original_index, int):
        return f"row_{original_index}"  # Use fallback ID
    return None


def collect_vector_ids(lines):
    """Derive one vector ID per usable JSONL line.

    Args:
        lines: iterable of text lines (an open file or a list of strings).

    Returns:
        ``(generated_ids, processed_lines, skipped_lines)`` where
        ``processed_lines`` counts non-blank lines and ``skipped_lines``
        counts blank lines plus every line for which no ID could be derived.
    """
    generated_ids = []
    processed_lines = 0
    skipped_lines = 0

    for line_num, line in enumerate(lines, 1):
        line = line.strip()
        if not line:
            skipped_lines += 1
            continue

        processed_lines += 1
        try:
            record_data = json.loads(line)
            metadata = record_data.get("metadata")
            if not isinstance(metadata, dict):
                print(f"Warning: Missing or invalid metadata on line {line_num}. Skipping ID check for this line.")
                skipped_lines += 1
                continue

            vector_id = resolve_vector_id(metadata)
            if vector_id is None:
                print(f"Warning: Cannot determine ID for line {line_num}. Skipping ID check for this line.")
                skipped_lines += 1
                continue

            generated_ids.append(vector_id)

        except json.JSONDecodeError:
            print(f"Warning: Invalid JSON on line {line_num}. Skipping ID check.")
            skipped_lines += 1
        except Exception as e:
            # e.g. a JSON value that is not an object and therefore has no .get()
            print(f"Error processing line {line_num}: {e}")
            skipped_lines += 1

    return generated_ids, processed_lines, skipped_lines


def diagnose_id_counts(generated_ids, observed_count=OBSERVED_PINECONE_COUNT):
    """Explain how the file's ID counts relate to the observed Pinecone count.

    The "no duplicates" test compares the unique-ID count with
    ``len(generated_ids)``. The previous ``processed_lines - skipped_lines``
    was too small whenever the file contained blank lines, because blank
    lines were added to the skipped counter without ever being counted as
    processed.

    Returns:
        The diagnostic messages, in print order.
    """
    id_total = len(generated_ids)
    unique_total = len(set(generated_ids))

    if unique_total == id_total:
        messages = ["The number of unique IDs matches the number of processable lines."]
        if unique_total != observed_count:
            messages.append(f"However, the unique count ({unique_total}) doesn't match the Pinecone count ({observed_count}).")
            messages.append("This might indicate an issue with pre-existing data in the namespace or unreported batch errors.")
        return messages

    messages = [
        f"Found {id_total - unique_total} duplicate ID instances.",
        f"Calculated final count based on unique IDs: {unique_total}",
    ]
    if unique_total == observed_count:
        messages.append("This matches the observed Pinecone count, confirming duplicates in the input file were the cause.")
    else:
        messages.append(f"This ({unique_total}) does NOT match the observed Pinecone count ({observed_count}).")
        messages.append("There might be multiple issues (duplicates AND pre-existing data/errors).")
    return messages


# The report only runs when this code is executed directly (a notebook cell or
# a script), not when the helpers above are imported.
if __name__ == "__main__":
    print(f"Analyzing IDs in {INPUT_JSONL_FILE}...")

    try:
        with open(INPUT_JSONL_FILE, 'r', encoding='utf-8') as f:
            generated_ids, processed_lines, skipped_lines = collect_vector_ids(f)
    except FileNotFoundError:
        print(f"Error: File not found: {INPUT_JSONL_FILE}")
        # `exit()` is an interactive-shell helper added by the `site` module;
        # raising SystemExit is the portable way to stop, and a failure
        # should not report exit status 0.
        raise SystemExit(1) from None

    print("-" * 30)
    print("File Analysis Complete:")
    print(f"  Total lines processed: {processed_lines}")
    print(f"  Total lines skipped (empty/error/no ID): {skipped_lines}")
    print(f"  Number of IDs generated: {len(generated_ids)}")
    print(f"  Number of UNIQUE IDs generated: {len(set(generated_ids))}") # Crucial comparison!

    id_counts = Counter(generated_ids)
    duplicates = {id_val: count for id_val, count in id_counts.items() if count > 1}

    print(f"  Number of duplicate IDs found: {len(duplicates)}")

    if duplicates:
        print("\nDuplicate IDs and their counts:")
        # Print only the first 20 duplicates for inspection
        for shown, (id_val, count) in enumerate(duplicates.items()):
            if shown == 20:
                print("  ... (and potentially more)")
                break
            print(f"  - ID: '{id_val}', Count: {count}")
    else:
        print("\nNo duplicate IDs were found within the file.")

    print("-" * 30)

    # Check if the numbers match
    for message in diagnose_id_counts(generated_ids):
        print(message)

Analyzing IDs in hubble_embeddings_rows_0_to_2705_new.jsonl...
------------------------------
File Analysis Complete:
  Total lines processed: 2706
  Total lines skipped (empty/error/no ID): 0
  Number of IDs generated: 2706
  Number of UNIQUE IDs generated: 2592
  Number of duplicate IDs found: 50

Duplicate IDs and their counts:
  - ID: 'heic0514c', Count: 4
  - ID: 'heic0601a', Count: 4
  - ID: 'heic0602a', Count: 4
  - ID: 'heic0602i', Count: 2
  - ID: 'heic0603d', Count: 4
  - ID: 'heic0604h', Count: 2
  - ID: 'heic0619e', Count: 2
  - ID: 'heic0701h', Count: 4
  - ID: 'heic0710a', Count: 4
  - ID: 'heic0710k', Count: 2
  - ID: 'heic0712g', Count: 4
  - ID: 'heic0806c', Count: 4
  - ID: 'heic0809b', Count: 4
  - ID: 'heic0815i', Count: 4
  - ID: 'heic0819b', Count: 2
  - ID: 'heic0902b', Count: 4
  - ID: 'heic0905b', Count: 2
  - ID: 'heic0911f', Count: 4
  - ID: 'heic0917ab', Count: 2
  - ID: 'heic1112e', Count: 4
  ... (and potentially more)
------------------------------
Found 

In [None]:
import json
import os
import time
import traceback
from pinecone import Pinecone
# Import exception if needed, otherwise rely on general Exception
try:
    from pinecone.exceptions import ApiException
except ImportError:
    ApiException = Exception # Fallback

# --- Pinecone Configuration ---
# Both credentials are intentionally blank here and must be filled in before
# running: the sanity checks in this cell abort when either one is empty.
# NOTE(review): consider loading these from the environment instead of
# hard-coding them in the notebook.
PINECONE_API_KEY = "" # Replace with your Pinecone API key
PINECONE_INDEX_HOST ="" # Replace with your index host URL
# <<< Set the correct namespace for THIS data >>>
PINECONE_NAMESPACE = "clip-embeddings" # Namespace for the CLIP/AstroLLaVA embeddings

# --- Input File Configuration ---
# JSONL file in the "new" record format: 'embedding', 'id' and the other fields
# are read from the top level of each record rather than from a nested
# 'metadata' object.
INPUT_JSONL_FILE = "astro_embeddings_clip_new.jsonl" # <<< Filename for the new format

# --- Upsert Configuration ---
# Number of vectors accumulated before each index.upsert() call.
UPSERT_BATCH_SIZE = 100

# --- Helper Function to Clean Metadata ---
def clean_metadata(metadata_dict):
    """Return a copy of ``metadata_dict`` restricted to simple value types.

    Per field: ``None`` becomes ``""``; ``str``/``bool``/``float``/``int``
    values pass through unchanged; a list keeps only its scalar items, each
    converted to ``str``; anything else is replaced by ``str(value)`` after a
    warning is printed. A non-dict argument yields an empty dict.
    """
    if not isinstance(metadata_dict, dict):
        return {}

    # Pinecone supports str, bool, float, int, list[str]
    scalar_types = (str, bool, float, int)

    def _sanitize(field_name, raw):
        if raw is None:
            return ""  # Replace None with empty string
        if isinstance(raw, scalar_types):
            return raw
        if isinstance(raw, list):
            # Non-scalar items (None, nested containers) are dropped, the rest stringified
            return [str(entry) for entry in raw if isinstance(entry, scalar_types)]
        # Any other type (e.g. a nested dict) is stored as its string form
        print(f"Warning: Metadata field '{field_name}' has unsupported type {type(raw)}. Converting to string.")
        return str(raw)

    return {field_name: _sanitize(field_name, raw) for field_name, raw in metadata_dict.items()}

# --- Helper Function to Validate Record (new top-level format) ---
def validate_record(record_dict, line_num):
    """Validate one parsed JSONL record and extract what the upsert needs.

    The record is expected in the "new" format, with ``embedding``, ``id``
    and ``original_index`` at the top level of the dictionary.

    Args:
        record_dict: the object produced by ``json.loads`` for one line.
        line_num: 1-based line number, used only in warning messages.

    Returns:
        ``(vector_id, embedding, record_dict)`` when the record is usable,
        otherwise ``(None, None, None)`` after printing a warning.
    """
    if not isinstance(record_dict, dict):
        print(f"Warning: Line {line_num}: Expected a dictionary, got {type(record_dict)}. Skipping.")
        return None, None, None

    embedding = record_dict.get("embedding")
    if not isinstance(embedding, list):
        print(f"Warning: Line {line_num}: 'embedding' field is missing or not a list. Skipping.")
        return None, None, None
    if not embedding:
        # An empty vector cannot match the index dimension; rejecting it here
        # costs one record instead of failing the whole batch it is sent with.
        print(f"Warning: Line {line_num}: 'embedding' field is an empty list. Skipping.")
        return None, None, None

    # --- ID Extraction (from top level) ---
    vector_id = record_dict.get("id")
    if not isinstance(vector_id, str) or not vector_id.strip():
        # Fallback: build the ID from the top-level 'original_index'.
        original_index = record_dict.get("original_index")
        # bool is a subclass of int; exclude it so a JSON true/false never
        # produces an ID such as "row_True".
        if isinstance(original_index, int) and not isinstance(original_index, bool):
            vector_id = f"row_{original_index}"
        else:
            print(f"Warning: Line {line_num}: Cannot determine valid string ID from top-level 'id' or 'original_index'. Skipping.")
            return None, None, None

    # The full record is returned so the caller can build metadata from it.
    return vector_id, embedding, record_dict


# --- Helper function to extract text from conversation ---
def extract_conversation_text(conversation_data):
    """Join the string entries of ``conversation_data["value"]`` with spaces.

    Returns an empty string when ``conversation_data`` is not a dict or its
    ``"value"`` entry is not a list. Non-string list items are ignored and the
    joined text is stripped of leading/trailing whitespace.
    """
    if isinstance(conversation_data, dict):
        candidates = conversation_data.get("value", [])
        if isinstance(candidates, list):
            joined = " ".join(str(part) for part in candidates if isinstance(part, str))
            return joined.strip()
    return ""


# --- Main Upsert Logic ---
print("--- Starting Pinecone Upsert Process ---")

# --- Sanity Checks ---
# Stop with `raise SystemExit(...)` rather than `exit(...)`: `exit` is a helper
# the `site` module installs for interactive shells and is not meant for
# programs. SystemExit with a string argument prints the message to stderr and
# exits with status 1.
# NOTE(review): under IPython/Jupyter `exit` is replaced by the shell's own
# object, so the message passed to it may never be displayed — confirm.
if not PINECONE_API_KEY or PINECONE_API_KEY == "YOUR_API_KEY":
    raise SystemExit("Error: Pinecone API key not set.")
if not PINECONE_INDEX_HOST or PINECONE_INDEX_HOST == "YOUR_INDEX_HOST":
    raise SystemExit("Error: Pinecone index host not set.")
if not os.path.exists(INPUT_JSONL_FILE):
    raise SystemExit(f"Error: Input file not found: '{INPUT_JSONL_FILE}'")

# --- Initialize Pinecone Connection ---
# `index` is the handle all later upsert and stats calls go through.
try:
    print("Initializing Pinecone connection...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(host=PINECONE_INDEX_HOST)
    print(f"Connected to Pinecone index host: {PINECONE_INDEX_HOST}")
    print("Initial index stats:", index.describe_index_stats())
except Exception as e:
    print(f"Error initializing Pinecone or connecting to index: {e}")
    traceback.print_exc()
    # A failed connection is an error: exit non-zero instead of the status 0
    # that a bare exit() reports.
    raise SystemExit(1) from e

# --- Read JSONL and Upsert in Batches ---
# Streams INPUT_JSONL_FILE line by line, turns each valid record into a
# {"id", "values", "metadata"} vector and sends the vectors to Pinecone in
# groups of UPSERT_BATCH_SIZE. A partial batch left over at end of file stays
# in `batch_to_upsert` and is not sent by this loop.
batch_to_upsert = []        # vectors waiting for the next upsert call
total_lines_read = 0        # every physical line, including blank ones
total_vectors_upserted = 0  # sum of the counts reported by successful upserts
total_skipped = 0           # invalid records plus every vector of a failed batch

print(f"Reading from '{INPUT_JSONL_FILE}' and upserting to namespace '{PINECONE_NAMESPACE}'...")

try:
    with open(INPUT_JSONL_FILE, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            total_lines_read += 1
            line = line.strip()
            # Blank lines are counted as read but not as skipped.
            if not line: continue

            try:
                record_data = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Line {line_num}: Invalid JSON. Skipping.")
                total_skipped += 1
                continue

            # validate_record returns (vector_id, embedding, full record), or
            # (None, None, None) after printing its own warning.
            vector_id, vector_values, original_record_data = validate_record(record_data, line_num)

            if vector_id is None: # Skip if validation failed (ID or embedding missing)
                total_skipped += 1
                continue

            # --- Construct Metadata for Pinecone from top-level fields ---
            # Only these four fields are stored; other keys in the record are dropped.
            pinecone_metadata = {
                "caption": original_record_data.get("caption", ""), # Get caption if exists
                "conversation_text": extract_conversation_text(original_record_data.get("conversation")), # Get conversation text
                "original_index": original_record_data.get("original_index", -1), # -1 marks a missing index
                "source_id": original_record_data.get("id", "") # Store the original ID field as well
                # Add any other top-level fields from your JSONL you want as metadata
            }

            # Clean the constructed metadata (None values, unsupported types) before upserting.
            cleaned_pinecone_metadata = clean_metadata(pinecone_metadata)

            # --- Prepare vector object for Pinecone ---
            vector_obj = {
                "id": vector_id, # The ID determined by validate_record
                "values": vector_values,
                "metadata": cleaned_pinecone_metadata # Use the cleaned, constructed metadata
            }
            batch_to_upsert.append(vector_obj)

            # --- Upsert when batch is full ---
            if len(batch_to_upsert) >= UPSERT_BATCH_SIZE:
                print(f"Upserting batch of {len(batch_to_upsert)} vectors (Total lines processed: {total_lines_read})...")
                try:
                    upsert_response = index.upsert(vectors=batch_to_upsert, namespace=PINECONE_NAMESPACE)
                    # Fall back to the batch size if the response has no 'upserted_count'.
                    upserted_count = getattr(upsert_response, 'upserted_count', len(batch_to_upsert))
                    total_vectors_upserted += upserted_count
                    print(f"  Successfully upserted batch (reported count: {upserted_count}).")
                except Exception as e:
                    # The reported line range is approximate: skipped lines make
                    # a batch span more than UPSERT_BATCH_SIZE input lines.
                    print(f"Error during Pinecone upsert batch (lines ~{line_num-UPSERT_BATCH_SIZE+1} to {line_num}):")
                    print(f"  Error Type: {type(e).__name__}")
                    print(f"  Error Details: {e}")
                    if "Metadata value" in str(e):
                         print("  Potential lingering metadata issue detected. Check cleaning logic or Pinecone metadata limits.")
                         # Debug aid, disabled: dump a sample of the failing batch's metadata.
                         # print("  Problematic batch metadata sample:")
                         # for item in batch_to_upsert[:5]: print(f"    ID: {item['id']}, Meta: {item['metadata']}")
                    # A failed batch is not retried; all of its vectors count as skipped.
                    print("  Skipping this batch and continuing...")
                    total_skipped += len(batch_to_upsert)
                finally:
                    # Start a fresh batch whether the upsert succeeded or failed.
                    batch_to_upsert = []

# File-level failures are reported and swallowed, so the code after this block still runs.
except FileNotFoundError: print(f"Error: Input file not found: {INPUT_JSONL_FILE}")
except IOError as e: print(f"Error reading file {INPUT_JSONL_FILE}: {e}")
except Exception as e: print(f"An unexpected error occurred during file processing: {e}"); traceback.print_exc()

# --- Upsert final batch ---
# Send whatever is still pending once the read loop has finished.
pending_count = len(batch_to_upsert)
if pending_count:
    print(f"Upserting final batch of {pending_count} vectors...")
    try:
        upsert_response = index.upsert(namespace=PINECONE_NAMESPACE, vectors=batch_to_upsert)
        # Use the batch size when the response carries no 'upserted_count'.
        upserted_count = getattr(upsert_response, 'upserted_count', pending_count)
        total_vectors_upserted = total_vectors_upserted + upserted_count
        print(f"  Successfully upserted final batch (reported count: {upserted_count}).")
    except Exception as e:
        print(
            "Error during final Pinecone upsert batch:",
            f"  Error Type: {type(e).__name__}",
            f"  Error Details: {e}",
            sep="\n",
        )
        # The failed batch is not retried; count all of it as skipped.
        total_skipped = total_skipped + pending_count
    finally:
        batch_to_upsert = []

# --- Final Summary ---
# Build the report lines first, then print them in order.
summary_lines = [
    "\n--- Pinecone Upsert Summary ---",
    f"Input File:          '{INPUT_JSONL_FILE}'",
    f"Target Namespace:    '{PINECONE_NAMESPACE}'",
    f"Total Lines Read:    {total_lines_read}",
    f"Total Vectors Upserted: {total_vectors_upserted}",
    f"Total Records Skipped: {total_skipped}",
    "-------------------------------",
]
print("\n".join(summary_lines))

# Optional: Verify final count
# Best-effort check: a failure to fetch stats is reported, never raised.
try:
    print("Fetching final index stats...")
    final_stats = index.describe_index_stats()
    print("Final index stats:", final_stats)
    namespace_stats = None
    if final_stats.namespaces:
        namespace_stats = final_stats.namespaces.get(PINECONE_NAMESPACE)
    if namespace_stats:
        print(f"Vector count in namespace '{PINECONE_NAMESPACE}': {namespace_stats.vector_count}")
    else:
        print(f"Namespace '{PINECONE_NAMESPACE}' not found in index stats.")
except Exception as e:
    print(f"Could not fetch final index stats: {e}")

print("Script finished.")

--- Starting Pinecone Upsert Process ---
Initializing Pinecone connection...
Connected to Pinecone index host: https://text-collection-4arfk3f.svc.aped-4627-b74a.pinecone.io
Initial index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'astrollava-embeddings': {'vector_count': 26260},
                'hubble-embeddings': {'vector_count': 2592}},
 'total_vector_count': 28852,
 'vector_type': 'dense'}
Reading from 'astro_embeddings_clip_new.jsonl' and upserting to namespace 'clip-embeddings'...
Upserting batch of 100 vectors (Total lines processed: 100)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 200)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 300)...
  Successfully upserted batch (reported count: 100).
Upserting batch of 100 vectors (Total lines processed: 400)...
  Successfully upserted batch (reported count: 100)