In [1]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [9]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.getenv('OPENAI_API_KEY')

In [3]:
import datetime
current_date=datetime.datetime.now().date()
target_date=datetime.date(2024,6,12)
if current_date > target_date:
    llm_model = "gpt-3.5-turbo"
else:
    llm_model = "gpt-3.5-turbo-0301"

In [4]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display,Markdown

In [5]:
from langchain.indexes import VectorstoreIndexCreator

In [31]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader,SummaryIndex

documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

Failed to load file data\~$data.docx with error: File is not a zip file. Skipping...


In [11]:
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)

The author mentioned that growing up, they worked on two main things outside of school: writing and programming. They wrote short stories and also tried writing programs on an IBM 1401 computer. They used an early version of Fortran and had to type programs on punch cards. They also mentioned getting a microcomputer later on and started programming more extensively, writing simple games and a word processor.


In [12]:
query_engine = index.as_query_engine()
response = query_engine.query("where author working?")
print(response)

The author worked on a project called Bel, which was mostly written in England.


In [14]:
query_engine = index.as_query_engine()
response = query_engine.query('''In the beginning, the author mentions working on short
stories before college. What was the nature of these
short stories, and how does the author describe them?''')
print(response)

The author describes the short stories they worked on before college as "awful" and lacking in plot. They mention that the stories mainly focused on characters with strong feelings, which the author believed made them deep.


In [15]:
query_engine = index.as_query_engine()
response = query_engine.query('''The author talks about programming on an IBM 1401 in 9th grade. 
                              What language did they use, and what challenges did they face in using this machine?''')
print(response)

The author used an early version of Fortran to program on the IBM 1401 in 9th grade. They faced challenges in figuring out what to do with the machine since the only form of input was data stored on punched cards, and they didn't have any data stored on punched cards. The only other option was to do things that didn't rely on any input, like calculating approximations of pi, but they didn't know enough math to do anything interesting of that type.


### Summarization

In [32]:
index = SummaryIndex.from_documents(documents)

query_engine = index.as_query_engine(response_mode="tree_summarize")
response = query_engine.query("What is a summary of this collection of text?")


In [33]:
print(response)

The collection of text provides a narrative of the author's journey through various experiences, including writing, programming, artificial intelligence, art, and entrepreneurship. It starts with the author's early experiences with writing and programming, leading to their fascination with microcomputers and eventual interest in AI. However, the author becomes more interested in Lisp programming and decides to focus on it, eventually writing a book about Lisp hacking. The text then shifts to the author's experiences as an art student, their involvement in the art world, and exploration of the World Wide Web. The author's venture of putting art galleries online is not successful, leading them to pivot and start a company called Viaweb, which builds online stores. The challenges, growth, and eventual acquisition of Viaweb by Yahoo are discussed. The author eventually leaves the company to pursue painting and starts their own investment firm. The text also touches on the founding and grow

### Queries over Structured Data

In [56]:
from sqlalchemy import (
    create_engine,
    MetaData,
    Table,
    Column,
    String,
    Integer,
    select,
    column,
)

engine = create_engine("sqlite:///:memory:")
metadata_obj = MetaData()

In [57]:
# create city SQL table
table_name = "city_stats"
city_stats_table = Table(
    table_name,
    metadata_obj,
    Column("city_name", String(16), primary_key=True),
    Column("population", Integer),
    Column("country", String(16), nullable=False),
)
metadata_obj.create_all(engine)

In [58]:
from sqlalchemy import insert

rows = [
    {"city_name": "Toronto", "population": 2731571, "country": "Canada"},
    {"city_name": "Tokyo", "population": 13929286, "country": "Japan"},
    {"city_name": "Berlin", "population": 600000, "country": "Germany"},
]
for row in rows:
    stmt = insert(city_stats_table).values(**row)
    with engine.begin() as connection:
        cursor = connection.execute(stmt)

In [59]:
from llama_index import SQLDatabase

sql_database=SQLDatabase(engine,include_tables=['city_stats'])

### natural language sql query

In [60]:
from llama_index.indices.struct_store import NLSQLTableQueryEngine

In [70]:
query_engine=NLSQLTableQueryEngine(
sql_database=sql_database,
tables=['city_stats'],
)
query_str="Which city has the highest population and give the  number of population of that city?"
response=query_engine.query(query_str)
print(response)

The city with the highest population is Tokyo, with a population of 13,929,286.


In [75]:
query_engine=NLSQLTableQueryEngine(
sql_database=sql_database,
tables=['city_stats'],
)
query_str="give the names present city and country in table format with index number ?"
response=query_engine.query(query_str)
print(response)

Index   City Name   Country
1         Toronto        Canada
2         Tokyo          Japan
3         Berlin         Germany


In [76]:
query_engine=NLSQLTableQueryEngine(
sql_database=sql_database,
tables=['city_stats'],
)
query_str="canada is country of berlin right ?"
response=query_engine.query(query_str)
print(response)

No, Berlin is not a city in Canada.


In [77]:
query_engine=NLSQLTableQueryEngine(
sql_database=sql_database,
tables=['city_stats'],
)
query_str="canada have huge populationtn isn't it ?"
response=query_engine.query(query_str)
print(response)

I'm sorry, but there is no data available for the population of Canada in the city_stats table.


In [81]:
query_engine=NLSQLTableQueryEngine(
sql_database=sql_database,
tables=['city_stats'],
)
query_str="Berlin  have huge populationtn isn't it and highest of compare to other city ? then what about Tokyo?"
response=query_engine.query(query_str)
print(response)

Tokyo actually has a much larger population compared to Berlin. According to the city statistics, Tokyo has a population of approximately 13,929,286, while Berlin has a population of around 600,000.
