In [2]:
%%capture
pip install llama-index openai pinecone-client

In [3]:
import openai
import os
from constants import openai_key

os.environ["OPENAI_API_KEY"] = openai_key   
openai.api_key = os.environ["OPENAI_API_KEY"]


In [4]:
import nest_asyncio

nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [5]:
from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    SQLDatabase,
)

In [8]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="pinecone_api_key")

# pc.create_index(
#     name="quickstart",
#     dimension=8, # Replace with your model dimensions
#     metric="euclidean", # Replace with your model metric
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-west-2"
#     ) 
# )


In [9]:
pc.describe_index("quickstart")

{'dimension': 8,
 'host': 'quickstart-io58o92.svc.apw5-4e34-81fa.pinecone.io',
 'metric': 'euclidean',
 'name': 'quickstart',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [10]:
pc_index = pc.Index('quickstart')

In [11]:
from llama_index import ServiceContext
from llama_index.storage import StorageContext
from llama_index.vector_stores import PineconeVectorStore
from llama_index.node_parser import TokenTextSplitter
from llama_index.llms import OpenAI

# define node parser and LLM
chunk_size = 1024
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo", streaming=True)
service_context = ServiceContext.from_defaults(chunk_size=chunk_size, llm=llm)
node_parser = TokenTextSplitter(chunk_size=chunk_size)

# define pinecone vector index
vector_store = PineconeVectorStore(
    pinecone_index=pc_index, namespace="ipeds"
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex([], storage_context=storage_context)


In [12]:
import sqlalchemy
from sqlalchemy import create_engine, MetaData
from sqlalchemy.orm import sessionmaker

def connect_to_database():
    # Database connection URL
    url = 'postgresql+psycopg2://USER:PASSWORD@HOST:PORT/DB_NAME'

    # Creating a SQLAlchemy engine
    engine = create_engine(url)

    # Creating session
    Session = sessionmaker(bind=engine)
    session = Session()

    # connection and a cursor
    connection = engine.connect()
    cursor = connection.connection.cursor()

    return connection, cursor, engine

def execute_query(cursor, query):
    # Executing the query
    cursor.execute(query)

    # Commiting the changes
    connection.commit()

    # Fetching the data
    result = cursor.fetchall()

    return result

# SQL Query
sql_query = '''
        Select IC.campusid, IC.pcaddr, IC.pccity
        from public."ADM2022" as ADM
        inner join public."IC2022_CAMPUSES" as IC
        on ADM.unitid = IC.index
        where ADM.admcon8 = 1
        limit 1;    
'''

# Function to print table names using Metadata
def print_table_names(engine):
    metadata = MetaData()
    metadata.reflect(bind=engine)
    table_names = metadata.tables.keys()

    print("Table Names:")
    for table_name in table_names:
        print(table_name)

# Connect to the database
connection, cursor, engine = connect_to_database()

# Execute the SQL query
results = execute_query(cursor, sql_query)

# Display the results
print("Results:")
for row in results:
    print(row)

# Print all table names
print_table_names(engine)

# Close the cursor and connection when done
cursor.close()
connection.close()





Results:
('Troy University-Phenix City Campus', 'Phenix City', 'AL')
Table Names:
EFFY2022
C2022_C
SFA2122
GR2022
C2022_B
GR200_22
OM2022
GR2022_PELL_SSL
IC2022
SFAV2122
C2022DEP
EFIA2022
C2022_A
HD2022
IC2022_PY
EFFY2022_DIST
ADM2022
FLAGS2022
IC2022_CAMPUSES
GR2022_L2
IC2022_AY


In [13]:
from sqlalchemy import create_engine, MetaData, Table
from llama_index.indices.struct_store.sql_query import NLSQLTableQueryEngine

# # Replace 'USER', 'PASSWORD', and 'your_database_name' with your actual credentials and database name
# database_url = 'postgresql+psycopg2://USER:PASSWORD@db-postgresql-nyc3-10726-do-user-15531455-0.c.db.ondigitalocean.com:25060/your_database_name'
# engine = create_engine(database_url)

# List of table names
table_names = [
    "EFFY2022",
    "C2022_C",
    "SFA2122",
    "GR2022",
    "C2022_B",
    "GR200_22",
    "OM2022",
    "GR2022_PELL_SSL",
    "IC2022",
    "SFAV2122",
    "C2022DEP",
    "EFIA2022",
    "C2022_A",
    "HD2022",
    "IC2022_PY",
    "EFFY2022_DIST",
    "ADM2022",
    "FLAGS2022",
    "IC2022_CAMPUSES",
    "GR2022_L2",
    "IC2022_AY"
]

# Create a MetaData object and load tables
metadata = MetaData()
tables = {table_name: Table(table_name, metadata, autoload_with=engine) for table_name in table_names}

# Create SQL index
sql_database = SQLDatabase(engine, include_tables=table_names)
sql_query_engine = NLSQLTableQueryEngine(sql_database=sql_database, tables=table_names)


In [15]:
for EFFY2022 in zip(table_names):
    nodes = node_parser.get_nodes_from_documents([EFFY2022])

    for node in nodes:
        node.metadata = {"title" : EFFY2022}
    vector_index.insert(nodes)


AttributeError: 'tuple' object has no attribute 'id_'