Create a test SQL database from the ocean dataset (`ocean_1.csv`).

https://python.langchain.com/docs/use_cases/sql/csv/

In [1]:
import pandas as pd
from pyprojroot import here

In [2]:
# Load the ocean_1 CSV from its project-root-relative location.
csv_path = here("data/for_upload/ocean_1.csv")
df = pd.read_csv(csv_path)

# Quick sanity check: dimensions, column names, then the first three rows.
print(df.shape)
print(df.columns.tolist())
display(df.head(3))

(1793, 13)
['id', 'type', 'cast_id', 'latitude', 'longitude', 'date', 'time', 'depth', 'temperature', 'salinity', 'pressure', 'dissolved_oxygen', 'region']


Unnamed: 0,id,type,cast_id,latitude,longitude,date,time,depth,temperature,salinity,pressure,dissolved_oxygen,region
0,CTD_Southern_Indian_Ocean_1_20240317_1,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,0.0,28.534,34.93,0.99,6.97,Southern Indian Ocean
1,CTD_Southern_Indian_Ocean_1_20240317_2,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,92.4,19.654,34.476,10.22,5.78,Southern Indian Ocean
2,CTD_Southern_Indian_Ocean_1_20240317_3,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,297.1,9.895,34.671,30.7,4.45,Southern Indian Ocean


### **SQL**

Using SQL to interact with CSV data is the recommended approach because it is easier to limit permissions and sanitize queries than with arbitrary Python.

Most SQL databases make it easy to load a CSV file in as a table (DuckDB, SQLite, etc.). Once you’ve done this you can use all of the chain and agent-creating techniques outlined in the SQL use case guide. Here’s a quick example of how we might do this with SQLite:

In [7]:
from langchain_community.utilities import SQLDatabase
from sqlalchemy import create_engine
# Location of the SQLite file inside the project's data directory.
db_file = str(here("data")) + "/test_sqldb.db"
# SQLAlchemy connection URL for that file; `db_path` keeps its original final value.
db_path = f"sqlite:///{db_file}"

engine = create_engine(db_path)
# The database file persists between sessions, so the default if_exists="fail"
# raises ValueError as soon as this cell is run a second time. Replacing the
# table keeps the cell idempotent under Restart & Run All.
df.to_sql("ocean_1", engine, index=False, if_exists="replace")

1793

For multiple CSV files, we can create a SQL database with multiple tables:
```
df1.to_sql("csv1_name", engine, index=False)
df2.to_sql("csv2_name", engine, index=False)
```

In [9]:
# Wrap the SQLAlchemy engine so LangChain tooling can inspect and query it.
db = SQLDatabase(engine=engine)

# Show which SQL dialect is in use and which tables are visible.
print(db.dialect)
print(db.get_usable_table_names())

# Smoke-test query: every ocean_1 row whose temperature is below 2.
low_temperature_sql = "SELECT * FROM ocean_1 WHERE temperature < 2;"
db.run(low_temperature_sql)

sqlite
['ocean_1', 'titanic']


"[('CTD_Southern_Indian_Ocean_1_20240317_8', 'CTD_Cast', 'CTD_Southern_Indian_Ocean_1_20240317', -29.4225, 80.3244, '2024-03-17', '02:11:34', 1028.4, 1.71, 34.794, 103.85, 2.64, 'Southern Indian Ocean'), ('CTD_Southern_Indian_Ocean_1_20240317_11', 'CTD_Cast', 'CTD_Southern_Indian_Ocean_1_20240317', -29.4225, 80.3244, '2024-03-17', '02:11:34', 1418.4, 1.91, 34.795, 142.8, 2.02, 'Southern Indian Ocean'), ('CTD_Southern_Indian_Ocean_1_20240317_13', 'CTD_Cast', 'CTD_Southern_Indian_Ocean_1_20240317', -29.4225, 80.3244, '2024-03-17', '02:11:34', 1304.3, 1.612, 34.907, 131.41, 2.28, 'Southern Indian Ocean'), ('CTD_Southern_Indian_Ocean_1_20240317_15', 'CTD_Cast', 'CTD_Southern_Indian_Ocean_1_20240317', -29.4225, 80.3244, '2024-03-17', '02:11:34', 1669.3, 1.991, 34.905, 167.93, 2.14, 'Southern Indian Ocean'), ('CTD_Southern_Indian_Ocean_1_20240317_16', 'CTD_Cast', 'CTD_Southern_Indian_Ocean_1_20240317', -29.4225, 80.3244, '2024-03-17', '02:11:34', 1765.6, 1.558, 34.902, 177.58, 2.06, 'Souther

**Equivalent in Pandas**

In [10]:
# Boolean mask selecting the same rows as the SQL query: temperature below 2.
temperature_below_2 = df["temperature"] < 2
df[temperature_below_2]

Unnamed: 0,id,type,cast_id,latitude,longitude,date,time,depth,temperature,salinity,pressure,dissolved_oxygen,region
7,CTD_Southern_Indian_Ocean_1_20240317_8,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,1028.4,1.710,34.794,103.85,2.64,Southern Indian Ocean
10,CTD_Southern_Indian_Ocean_1_20240317_11,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,1418.4,1.910,34.795,142.80,2.02,Southern Indian Ocean
12,CTD_Southern_Indian_Ocean_1_20240317_13,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,1304.3,1.612,34.907,131.41,2.28,Southern Indian Ocean
14,CTD_Southern_Indian_Ocean_1_20240317_15,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,1669.3,1.991,34.905,167.93,2.14,Southern Indian Ocean
15,CTD_Southern_Indian_Ocean_1_20240317_16,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,1765.6,1.558,34.902,177.58,2.06,Southern Indian Ocean
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1783,CTD_Equatorial_Indian_Ocean_76_20250824_15,CTD_Cast,CTD_Equatorial_Indian_Ocean_76_20250824,7.8635,73.1906,2025-08-24,13:24:56,2073.6,1.508,34.914,208.33,2.94,Equatorial Indian Ocean
1785,CTD_Equatorial_Indian_Ocean_76_20250824_17,CTD_Cast,CTD_Equatorial_Indian_Ocean_76_20250824,7.8635,73.1906,2025-08-24,13:24:56,1418.6,1.689,34.808,142.82,2.24,Equatorial Indian Ocean
1786,CTD_Equatorial_Indian_Ocean_76_20250824_18,CTD_Cast,CTD_Equatorial_Indian_Ocean_76_20250824,7.8635,73.1906,2025-08-24,13:24:56,1607.6,1.679,34.603,161.77,2.91,Equatorial Indian Ocean
1788,CTD_Equatorial_Indian_Ocean_76_20250824_20,CTD_Cast,CTD_Equatorial_Indian_Ocean_76_20250824,7.8635,73.1906,2025-08-24,13:24:56,1632.3,1.792,34.766,164.21,2.45,Equatorial Indian Ocean


### **Create an agent to interact with the Database**

In [11]:
import os
from dotenv import load_dotenv
import warnings
# Suppress all warnings for the rest of the session to keep the output short.
warnings.filterwarnings("ignore")
print("Environment variables are loaded:", load_dotenv())
# Report only whether the key is present. Printing the value itself would store
# the secret in the saved notebook output and leak it to anyone with the file.
print("test by reading a variable:", "<set>" if os.getenv("GOOGLE_API_KEY") else "<missing>")


Environment variables are loaded: True
test by reading a variable: <redacted>


In [12]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Indexing os.environ (rather than os.getenv) raises KeyError right here if the
# key was not loaded, instead of passing None on to the client.
google_api_key = os.environ["GOOGLE_API_KEY"]

# Chat model that the SQL agent will use.
llm = ChatGoogleGenerativeAI(
    google_api_key=google_api_key,
    model="gemini-1.5-flash",
    temperature=0.0,
)

In [13]:
from langchain_community.agent_toolkits import create_sql_agent
# Build a tool-calling SQL agent over the database; verbose=True prints each
# tool invocation as the agent runs.
agent_executor = create_sql_agent(
    llm=llm,
    db=db,
    agent_type="tool-calling",
    verbose=True,
)

In [14]:
# Ask the agent a natural-language question; the cell displays the returned dict.
question = "what is the average value of depth"
agent_executor.invoke({"input": question})



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mocean_1, titanic[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'ocean_1'}`


[0m[33;1m[1;3m
CREATE TABLE ocean_1 (
	id TEXT, 
	type TEXT, 
	cast_id TEXT, 
	latitude FLOAT, 
	longitude FLOAT, 
	date TEXT, 
	time TEXT, 
	depth FLOAT, 
	temperature FLOAT, 
	salinity FLOAT, 
	pressure FLOAT, 
	dissolved_oxygen FLOAT, 
	region TEXT
)

/*
3 rows from ocean_1 table:
id	type	cast_id	latitude	longitude	date	time	depth	temperature	salinity	pressure	dissolved_oxygen	region
CTD_Southern_Indian_Ocean_1_20240317_1	CTD_Cast	CTD_Southern_Indian_Ocean_1_20240317	-29.4225	80.3244	2024-03-17	02:11:34	0.0	28.534	34.93	0.99	6.97	Southern Indian Ocean
CTD_Southern_Indian_Ocean_1_20240317_2	CTD_Cast	CTD_Southern_Indian_Ocean_1_20240317	-29.4225	80.3244	2024-03-17	02:11:34	92.4	19.654	34.476	10.22	5.78	Southern Indian Ocean
CTD_Southern_Indian_Ocean_1_2024031

{'input': 'what is the average value of depth',
 'output': 'The average depth is 1213.53 meters.\n'}