In [13]:
import pandas as pd
import pinecone
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import os
from pinecone import Pinecone, ServerlessSpec

# Load variables from a local .env file (if one exists) so PINECONE_API_KEY is
# available even when it was not exported in the shell that started the kernel.
# load_dotenv() is a no-op when there is no .env file and, by default, does not
# override variables that are already set.
load_dotenv()

# Fail early with an actionable message instead of handing None to the client.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to a .env file"
    )

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)

# Create or connect to an index
index_name = 'acs-tables'  # Choose an index name
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors that get upserted; 384 is the
        # embedding size of the 'all-MiniLM-L6-v2' sentence-transformers model.
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Handle used for all subsequent reads/writes against the index
index = pc.Index(index_name)

In [14]:
# Read the table catalogue from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer='ACS_2022_tables_mini.csv')

In [15]:
# Columns whose values are concatenated into the text that gets embedded
text_columns = ['name', 'title', 'universe']

# Ensure columns are of type string. Missing values are blanked out first:
# a bare astype(str) would turn NaN into the literal string 'nan', which would
# then be embedded and stored as if it were real content.
df[text_columns] = df[text_columns].fillna('').astype(str)

# Combine columns into a 'text' column, skipping blank parts so a missing
# value does not leave stray double or trailing spaces.
df['text'] = df[text_columns].agg(
    lambda parts: ' '.join(part for part in parts if part), axis=1
)


In [16]:
# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the 'text' column.
# encode() accepts a list of sentences and batches them internally, which
# avoids the per-call overhead of encoding one row at a time via .apply().
embeddings = model.encode(df['text'].tolist())

# Store every vector as a plain Python list (one list per row). Building the
# Series on df.index keeps the vectors aligned with their rows even when the
# index is not a default 0..n-1 range.
df['embedding'] = pd.Series(embeddings.tolist(), index=df.index)


In [18]:
# Build one (id, values, metadata) tuple per DataFrame row.
data_to_upsert = [
    (
        str(row_id),  # the row's index label doubles as the unique ID
        vector,
        {
            'name': name,
            'title': title,
            'universe': universe,
            'text': text,
        },
    )
    for row_id, vector, name, title, universe, text in zip(
        df.index,
        df['embedding'],
        df['name'],
        df['title'],
        df['universe'],
        df['text'],
    )
]


In [19]:
# Upsert data into Pinecone.
# Sending everything in a single request only works while the payload stays
# under the service's per-request limits; batch_size makes the client split
# the list into chunks and return the combined upserted_count.
index.upsert(vectors=data_to_upsert, batch_size=100)


{'upserted_count': 248}

In [None]:
# Preview the first five rows, including the generated text and embedding columns
df.head(n=5)

Unnamed: 0,name,title,universe,text,embedding
0,G001,Geography Identifiers,none,G001 Geography Identifiers none,"[0.02329733781516552, -0.01526133343577385, 0...."
1,A00001,Total Population,Total Population,A00001 Total Population Total Population,"[0.04868048056960106, -0.04043744131922722, -0..."
2,A00002,Population Density (Per Sq. Mile),Total Population,A00002 Population Density (Per Sq. Mile) Total...,"[0.1295415163040161, -0.08986365795135498, -0...."
3,A00003,Land Area (Sq. Miles),Area Total,A00003 Land Area (Sq. Miles) Area Total,"[0.13329695165157318, -0.05397973209619522, -0..."
4,A02001,Sex,Total Population,A02001 Sex Total Population,"[0.022333547472953796, -0.021919462829828262, ..."
