Create a test SQL database from the Titanic dataset.

https://python.langchain.com/docs/use_cases/sql/csv/

In [7]:
import pandas as pd
from pyprojroot import here


### **Connecting to SQL Server**

There are several ways to connect to SQL Server from Python. Here are the most common methods:

In [8]:
# Method 1: Using pyodbc (recommended for SQL Server)
import os

import pyodbc
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from langchain_community.utilities import SQLDatabase

# Connection parameters.
# Each value is taken from an environment variable when it is set, so real
# credentials never have to be typed into (and saved with) the notebook.
# The original placeholders remain as the fallback values.
server = os.environ.get("MSSQL_SERVER", "your_server_name")  # e.g., "localhost" or "server.domain.com"
database = os.environ.get("MSSQL_DATABASE", "your_database_name")
username = os.environ.get("MSSQL_USERNAME", "your_username")
password = os.environ.get("MSSQL_PASSWORD", "your_password")
odbc_driver = "ODBC Driver 17 for SQL Server"

# Option 1a: Direct pyodbc connection
connection_string = (
    f"DRIVER={{{odbc_driver}}};"
    f"SERVER={server};"
    f"DATABASE={database};"
    f"UID={username};"
    f"PWD={password};"
)

# Only driver/database errors are reported here; anything else (e.g. a typo in
# this cell) should surface as a normal traceback instead of being swallowed.
try:
    conn = pyodbc.connect(connection_string)
except pyodbc.Error as e:
    print(f"Error connecting to SQL Server: {e}")
else:
    print("Connected to SQL Server successfully!")
    conn.close()

# Option 1b: Using SQLAlchemy with pyodbc
# URL.create escapes each component, so credentials containing characters such
# as "@", "/" or ":" do not corrupt the connection URL the way raw string
# interpolation would. create_engine accepts the resulting URL object directly.
sqlalchemy_url = URL.create(
    "mssql+pyodbc",
    username=username,
    password=password,
    host=server,
    database=database,
    query={"driver": odbc_driver},
)
engine = create_engine(sqlalchemy_url)

# Create LangChain SQLDatabase object
db = SQLDatabase(engine=engine)
print(f"Database dialect: {db.dialect}")
print(f"Available tables: {db.get_usable_table_names()}")

ModuleNotFoundError: No module named 'pyodbc'

In [None]:
# Method 2: Using pymssql (alternative driver)
import os

import pymssql
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
# Imported in this cell as well: the cell must not rely on an import that only
# happens in another cell (which may have failed or not been run).
from langchain_community.utilities import SQLDatabase

# Connection parameters.
# Each value is taken from an environment variable when it is set, so real
# credentials never have to be typed into (and saved with) the notebook.
# The original placeholders remain as the fallback values.
server = os.environ.get("MSSQL_SERVER", "your_server_name")
database = os.environ.get("MSSQL_DATABASE", "your_database_name")
username = os.environ.get("MSSQL_USERNAME", "your_username")
password = os.environ.get("MSSQL_PASSWORD", "your_password")

# Option 2a: Direct pymssql connection
# Only driver/database errors are reported here; anything else should surface
# as a normal traceback instead of being swallowed.
try:
    conn = pymssql.connect(
        server=server,
        user=username,
        password=password,
        database=database
    )
except pymssql.Error as e:
    print(f"Error connecting with pymssql: {e}")
else:
    print("Connected to SQL Server with pymssql successfully!")
    conn.close()

# Option 2b: Using SQLAlchemy with pymssql
# URL.create escapes each component, so credentials containing characters such
# as "@", "/" or ":" do not corrupt the connection URL the way raw string
# interpolation would. create_engine accepts the resulting URL object directly.
sqlalchemy_url_pymssql = URL.create(
    "mssql+pymssql",
    username=username,
    password=password,
    host=server,
    database=database,
)
engine_pymssql = create_engine(sqlalchemy_url_pymssql)

# Create LangChain SQLDatabase object
db_pymssql = SQLDatabase(engine=engine_pymssql)

In [None]:
# Load the Titanic CSV (path resolved from the project root via `here`) and
# show its dimensions, its column names, and the first three rows.
df = pd.read_csv(here("data/for_upload/titanic.csv"))
print(df.shape, df.columns.tolist(), sep="\n")
display(df.head(3))

(887, 8)
['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']


Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925


### **SQL**

Using SQL to interact with CSV data is the recommended approach because it is easier to limit permissions and sanitize queries than with arbitrary Python.

Most SQL databases make it easy to load a CSV file in as a table (DuckDB, SQLite, etc.). Once you’ve done this you can use all of the chain and agent-creating techniques outlined in the SQL use case guide. Here’s a quick example of how we might do this with SQLite:

In [None]:
from langchain_community.utilities import SQLDatabase
from sqlalchemy import create_engine

# Build the SQLAlchemy URL for a SQLite file inside the project's data folder.
db_path = str(here("data")) + "/test_sqldb.db"
db_path = f"sqlite:///{db_path}"

engine = create_engine(db_path)
# if_exists="replace" makes this cell safe to re-run: with the pandas default
# ("fail"), a second execution raises ValueError because the "titanic" table
# already exists in the database file from the previous run.
df.to_sql("titanic", engine, index=False, if_exists="replace")

887

For multiple CSV files, we can create a SQL database with multiple tables:
```
df1.to_sql("csv1_name", engine, index=False)
df2.to_sql("csv2_name", engine, index=False)
```

In [None]:
# Wrap the engine for LangChain, then confirm the dialect and table list
# before running a sample query (its result is the cell's displayed output).
db = SQLDatabase(engine=engine)
for db_fact in (db.dialect,):
    print(db_fact)
print(db.get_usable_table_names())
db.run(
    "SELECT * FROM titanic WHERE Age < 2;"
)

sqlite
['titanic']


"[(1, 2, 'Master. Alden Gates Caldwell', 'male', 0.83, 0, 2, 29.0), (0, 3, 'Master. Eino Viljami Panula', 'male', 1.0, 4, 1, 39.6875), (1, 3, 'Miss. Eleanor Ileen Johnson', 'female', 1.0, 1, 1, 11.1333), (1, 2, 'Master. Richard F Becker', 'male', 1.0, 2, 1, 39.0), (1, 1, 'Master. Hudson Trevor Allison', 'male', 0.92, 1, 2, 151.55), (1, 3, 'Miss. Maria Nakid', 'female', 1.0, 0, 2, 15.7417), (0, 3, 'Master. Sidney Leonard Goodwin', 'male', 1.0, 5, 2, 46.9), (1, 3, 'Miss. Helene Barbara Baclini', 'female', 0.75, 2, 1, 19.2583), (1, 3, 'Miss. Eugenie Baclini', 'female', 0.75, 2, 1, 19.2583), (1, 2, 'Master. Viljo Hamalainen', 'male', 0.67, 1, 1, 14.5), (1, 3, 'Master. Bertram Vere Dean', 'male', 1.0, 1, 2, 20.575), (1, 3, 'Master. Assad Alexander Thomas', 'male', 0.42, 0, 1, 8.5167), (1, 2, 'Master. Andre Mallet', 'male', 1.0, 0, 2, 37.0042), (1, 2, 'Master. George Sibley Richards', 'male', 0.83, 1, 1, 18.75)]"

**Equivalent in Pandas**

In [None]:
# Same filter as the SQL query above, expressed with a boolean mask in pandas.
df.loc[df["Age"] < 2]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
77,1,2,Master. Alden Gates Caldwell,male,0.83,0,2,29.0
163,0,3,Master. Eino Viljami Panula,male,1.0,4,1,39.6875
171,1,3,Miss. Eleanor Ileen Johnson,female,1.0,1,1,11.1333
182,1,2,Master. Richard F Becker,male,1.0,2,1,39.0
303,1,1,Master. Hudson Trevor Allison,male,0.92,1,2,151.55
379,1,3,Miss. Maria Nakid,female,1.0,0,2,15.7417
384,0,3,Master. Sidney Leonard Goodwin,male,1.0,5,2,46.9
466,1,3,Miss. Helene Barbara Baclini,female,0.75,2,1,19.2583
641,1,3,Miss. Eugenie Baclini,female,0.75,2,1,19.2583
751,1,2,Master. Viljo Hamalainen,male,0.67,1,1,14.5


### **Create an agent to interact with the Database**

In [None]:
import os
from langchain_openai import ChatOpenAI


# The token is read from the environment and is deliberately NOT assigned in
# this cell: writing `os.environ['GITHUB_TOKEN'] = ""` here would overwrite a
# correctly exported token with an empty string (making the check below always
# fail) and would tempt readers to paste a secret into the notebook.
# Export GITHUB_TOKEN in the shell that starts Jupyter before running this cell.
token = os.environ.get("GITHUB_TOKEN")
endpoint = "https://models.github.ai/inference"
model_name = "openai/gpt-4.1-mini"

# Fail early with a clear message instead of a later authentication error.
if not token:
    raise ValueError("GITHUB_TOKEN environment variable not set. Please provide a valid token.")

# Chat model served through the endpoint above, authenticated with the token.
llm = ChatOpenAI(
    model_name=model_name,
    openai_api_key=token,
    openai_api_base=endpoint,
    temperature=0.5,
)

In [11]:
from langchain_community.agent_toolkits import create_sql_agent

# Build the SQL agent on top of the chat model and the SQLite-backed database.
# verbose=True is what produces the step-by-step trace shown under each query.
agent_executor = create_sql_agent(
    llm=llm,
    db=db,
    agent_type="openai-tools",
    verbose=True,
)

In [12]:
# Free-text lookup of a single passenger by name.
agent_executor.invoke(
    {"input": "Tell me more about Anders Johan Andersson"},
)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"Age" FLOAT, 
	"Siblings/Spouses Aboard" BIGINT, 
	"Parents/Children Aboard" BIGINT, 
	"Fare" FLOAT
)

/*
3 rows from titanic table:
Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
0	3	Mr. Owen Harris Braund	male	22.0	1	0	7.25
1	1	Mrs. John Bradley (Florence Briggs Thayer) Cumings	female	38.0	1	0	71.2833
1	3	Miss. Laina Heikkinen	female	26.0	0	0	7.925
*/[0m[32;1m[1;3m
Invoking: `sql_db_query` with `{'query': 'SELECT Survived, Pclass, Name, Sex, Age, "Siblings/Spouses Aboard", "Parents/Children Aboard", Fare FROM titanic WHERE Name LIKE \'%Anders Johan Andersson%\' LIMIT 10;'}`


[0m[36;1m[1;3m[(0, 3, 'Mr. Anders Johan Ander

{'input': 'Tell me more about Anders Johan Andersson',
 'output': 'Anders Johan Andersson was a male passenger on the Titanic. He was 39 years old, traveled in 3rd class, and did not survive. He had 1 sibling/spouse aboard and 5 parents/children aboard. The fare for his ticket was 31.275.'}

In [13]:
# Simple aggregate question: one filter plus one average.
agent_executor.invoke(
    {"input": "what's the average age of survivors"},
)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"Age" FLOAT, 
	"Siblings/Spouses Aboard" BIGINT, 
	"Parents/Children Aboard" BIGINT, 
	"Fare" FLOAT
)

/*
3 rows from titanic table:
Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
0	3	Mr. Owen Harris Braund	male	22.0	1	0	7.25
1	1	Mrs. John Bradley (Florence Briggs Thayer) Cumings	female	38.0	1	0	71.2833
1	3	Miss. Laina Heikkinen	female	26.0	0	0	7.925
*/[0m[32;1m[1;3m
Invoking: `sql_db_query_checker` with `{'query': 'SELECT AVG(Age) AS average_age_survivors FROM titanic WHERE Survived = 1;'}`


[0m[36;1m[1;3mSELECT AVG(Age) AS average_age_survivors FROM titanic WHERE Survived = 1;[0m[32;1m[1;3m
Invoking: `sql_db_query` w

{'input': "what's the average age of survivors",
 'output': 'The average age of survivors is approximately 28.41 years.'}

In [14]:
# Equivalent computation in pandas: select the Age column for the rows where
# Survived equals 1, then average it.
df.loc[df["Survived"] == 1, "Age"].mean()

np.float64(28.408391812865496)

### **Complex Query 1: Survival rate by passenger class**

### **Dataset Overview**
Before running complex queries, let's verify what columns are available in our dataset.

In [None]:
# Summarise the loaded DataFrame: dimensions, per-column dtypes, a three-row
# preview, and finally the plain list of column names.
print("Dataset shape:", df.shape)
print("\nColumn names and types:", df.dtypes, sep="\n")
print("\nFirst few rows:")
display(df.head(3))
print("\nColumn list:", df.columns.tolist())

In [15]:
# Grouped aggregate: per-class passenger count, survivor count and rate.
agent_executor.invoke(
    {
        "input": "What is the survival rate for each passenger class? Show the class, total passengers, survivors, and survival rate percentage."
    }
)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"Age" FLOAT, 
	"Siblings/Spouses Aboard" BIGINT, 
	"Parents/Children Aboard" BIGINT, 
	"Fare" FLOAT
)

/*
3 rows from titanic table:
Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
0	3	Mr. Owen Harris Braund	male	22.0	1	0	7.25
1	1	Mrs. John Bradley (Florence Briggs Thayer) Cumings	female	38.0	1	0	71.2833
1	3	Miss. Laina Heikkinen	female	26.0	0	0	7.925
*/[0m[32;1m[1;3m
Invoking: `sql_db_query_checker` with `{'query': 'SELECT Pclass AS class, COUNT(*) AS total_passengers, SUM(Survived) AS survivors, ROUND(100.0 * SUM(Survived) / COUNT(*), 2) AS survival_rate_percentage FROM titanic GROUP BY Pclass ORDER BY Pclass;'}`


[0m[36;

{'input': 'What is the survival rate for each passenger class? Show the class, total passengers, survivors, and survival rate percentage.',
 'output': 'Here is the survival rate for each passenger class on the Titanic:\n\n- Class 1: Total Passengers = 216, Survivors = 136, Survival Rate = 62.96%\n- Class 2: Total Passengers = 184, Survivors = 87, Survival Rate = 47.28%\n- Class 3: Total Passengers = 487, Survivors = 119, Survival Rate = 24.44%'}

In [18]:
# Pandas verification
# Re-read the CSV so the check starts from an unmodified copy of the data.
df = pd.read_csv(here("data/for_upload/titanic.csv"))

# One row per passenger class: head count, survivor count, and survival rate
# (rounded to 4 decimals), plus the rate as a percentage rounded to 2 decimals.
survival_by_class = (
    df.groupby('Pclass')
    .agg(
        Total_Passengers=('Survived', 'count'),
        Survivors=('Survived', 'sum'),
        Survival_Rate=('Survived', 'mean'),
    )
    .round(4)
    .assign(Survival_Rate_Pct=lambda rates: (rates['Survival_Rate'] * 100).round(2))
)
print("Pandas verification:")
display(survival_by_class)

Pandas verification:


Unnamed: 0_level_0,Total_Passengers,Survivors,Survival_Rate,Survival_Rate_Pct
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,216,136,0.6296,62.96
2,184,87,0.4728,47.28
3,487,119,0.2444,24.44


### **Complex Query 2: Gender and class survival analysis**

In [19]:
# Two-level grouping: survival rate per (gender, class) combination.
agent_executor.invoke(
    {
        "input": "Compare survival rates between male and female passengers for each passenger class. Show gender, class, and survival rate."
    }
)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"Age" FLOAT, 
	"Siblings/Spouses Aboard" BIGINT, 
	"Parents/Children Aboard" BIGINT, 
	"Fare" FLOAT
)

/*
3 rows from titanic table:
Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
0	3	Mr. Owen Harris Braund	male	22.0	1	0	7.25
1	1	Mrs. John Bradley (Florence Briggs Thayer) Cumings	female	38.0	1	0	71.2833
1	3	Miss. Laina Heikkinen	female	26.0	0	0	7.925
*/[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"

{'input': 'Compare survival rates between male and female passengers for each passenger class. Show gender, class, and survival rate.',
 'output': 'The survival rates between male and female passengers for each passenger class are as follows:\n\n- First Class: Female survival rate is 96.81%, Male survival rate is 36.89%\n- Second Class: Female survival rate is 92.11%, Male survival rate is 15.74%\n- Third Class: Female survival rate is 50.00%, Male survival rate is 13.70%\n\nFemales had a higher survival rate than males in all passenger classes.'}

In [20]:
# Pandas verification
# One row per (Sex, Pclass) pair: group size and survival rate (rounded to
# 4 decimals), plus the rate as a percentage rounded to 2 decimals.
survival_by_gender_class = (
    df.groupby(['Sex', 'Pclass'])
    .agg(
        Total_Count=('Survived', 'count'),
        Survival_Rate=('Survived', 'mean'),
    )
    .round(4)
    .assign(Survival_Rate_Pct=lambda rates: (rates['Survival_Rate'] * 100).round(2))
)
print("Pandas verification:")
display(survival_by_gender_class)

Pandas verification:


Unnamed: 0_level_0,Unnamed: 1_level_0,Total_Count,Survival_Rate,Survival_Rate_Pct
Sex,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,1,94,0.9681,96.81
female,2,76,0.9211,92.11
female,3,144,0.5,50.0
male,1,122,0.3689,36.89
male,2,108,0.1574,15.74
male,3,343,0.137,13.7


### **Complex Query 3: Age group survival analysis**

In [21]:
# Bucketing question: the agent has to derive the age groups itself.
agent_executor.invoke(
    {
        "input": """Categorize passengers into age groups: Children (0-12), Teens (13-19), Adults (20-59), Seniors (60+).
    Show the count and survival rate for each age group."""
    }
)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"Age" FLOAT, 
	"Siblings/Spouses Aboard" BIGINT, 
	"Parents/Children Aboard" BIGINT, 
	"Fare" FLOAT
)

/*
3 rows from titanic table:
Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
0	3	Mr. Owen Harris Braund	male	22.0	1	0	7.25
1	1	Mrs. John Bradley (Florence Briggs Thayer) Cumings	female	38.0	1	0	71.2833
1	3	Miss. Laina Heikkinen	female	26.0	0	0	7.925
*/[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"

{'input': 'Categorize passengers into age groups: Children (0-12), Teens (13-19), Adults (20-59), Seniors (60+).\n    Show the count and survival rate for each age group.',
 'output': 'Here is the categorization of passengers into age groups along with the count and survival rate for each group:\n\n- Children (0-12): 79 passengers, Survival Rate: 54.43%\n- Teens (13-19): 120 passengers, Survival Rate: 38.33%\n- Adults (20-59): 657 passengers, Survival Rate: 37.44%\n- Seniors (60+): 31 passengers, Survival Rate: 22.58%'}

In [22]:
# Pandas verification
def categorize_age(age):
    """Map a passenger age to one of the notebook's age-group labels.

    Args:
        age: Age in years, or a missing value.

    Returns:
        'Unknown' when ``pd.isna(age)`` is true; otherwise the label of the
        first bracket whose inclusive upper bound (12, 19, 59) is not exceeded,
        or 'Seniors (60+)' for anything above 59.

    Note:
        Bounds are compared with ``<=``, so a fractional age that lies between
        two brackets (e.g. 12.5) is placed in the older bracket.
    """
    if pd.isna(age):
        return 'Unknown'
    # Brackets are checked in ascending order; the first match wins.
    for upper_bound, label in (
        (12, 'Children (0-12)'),
        (19, 'Teens (13-19)'),
        (59, 'Adults (20-59)'),
    ):
        if age <= upper_bound:
            return label
    return 'Seniors (60+)'

# Label every passenger with an age group, then summarise each group: size,
# survivor count, survival rate (4 decimals) and rate as a percentage
# (2 decimals), with the largest group listed first.
df['AgeGroup'] = df['Age'].apply(categorize_age)
age_group_survival = (
    df.groupby('AgeGroup')
    .agg(
        Total_Count=('Survived', 'count'),
        Survivors=('Survived', 'sum'),
        Survival_Rate=('Survived', 'mean'),
    )
    .round(4)
    .assign(Survival_Rate_Pct=lambda rates: (rates['Survival_Rate'] * 100).round(2))
    .sort_values('Total_Count', ascending=False)
)
print("Pandas verification:")
display(age_group_survival)

Pandas verification:


Unnamed: 0_level_0,Total_Count,Survivors,Survival_Rate,Survival_Rate_Pct
AgeGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adults (20-59),657,246,0.3744,37.44
Teens (13-19),120,46,0.3833,38.33
Children (0-12),79,43,0.5443,54.43
Seniors (60+),31,7,0.2258,22.58


### **Complex Query 4: Family size impact on survival**

In [None]:
# Derived-column question: family size is computed from two existing columns,
# which are quoted in the prompt because their names contain spaces and slashes.
agent_executor.invoke(
    {
        "input": """Calculate family size ("Siblings/Spouses Aboard" + "Parents/Children Aboard" + 1) for each passenger and analyze survival rates.
    Group by family size and show the count and survival rate. Order by family size."""
    }
)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"Age" FLOAT, 
	"Siblings/Spouses Aboard" BIGINT, 
	"Parents/Children Aboard" BIGINT, 
	"Fare" FLOAT
)

/*
3 rows from titanic table:
Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
0	3	Mr. Owen Harris Braund	male	22.0	1	0	7.25
1	1	Mrs. John Bradley (Florence Briggs Thayer) Cumings	female	38.0	1	0	71.2833
1	3	Miss. Laina Heikkinen	female	26.0	0	0	7.925
*/[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"

{'input': 'Calculate family size (SibSp + Parch + 1) for each passenger and analyze survival rates.\n    Group by family size and show the count and survival rate. Order by family size.',
 'output': 'Here is the analysis of survival rates grouped by family size (calculated as SibSp + Parch + 1):\n\n- Family Size 1: Count = 533, Survival Rate = 30.58%\n- Family Size 2: Count = 161, Survival Rate = 55.28%\n- Family Size 3: Count = 102, Survival Rate = 57.84%\n- Family Size 4: Count = 29, Survival Rate = 72.41%\n- Family Size 5: Count = 15, Survival Rate = 20.00%\n- Family Size 6: Count = 22, Survival Rate = 13.64%\n- Family Size 7: Count = 12, Survival Rate = 33.33%\n- Family Size 8: Count = 6, Survival Rate = 0.00%\n- Family Size 11: Count = 7, Survival Rate = 0.00%\n\nThe highest survival rate is observed for family size 4, while very large families (8 and 11) had no survivors in this dataset.'}

In [26]:
# Pandas verification
# The family-size check needs both relative-count columns; when either one is
# missing, report the available columns instead of computing anything.
if not {'Siblings/Spouses Aboard', 'Parents/Children Aboard'}.issubset(df.columns):
    print("Column check:")
    print("Available columns:", df.columns.tolist())
    print("\nNote: Required family columns not found in the dataset.")
    print("Cannot calculate family size without these columns.")
else:
    # Family size = siblings/spouses + parents/children + the passenger.
    df['FamilySize'] = df['Siblings/Spouses Aboard'] + df['Parents/Children Aboard'] + 1
    family_survival = (
        df.groupby('FamilySize')
        .agg(
            Total_Count=('Survived', 'count'),
            Survivors=('Survived', 'sum'),
            Survival_Rate=('Survived', 'mean'),
        )
        .round(4)
        .assign(Survival_Rate_Pct=lambda rates: (rates['Survival_Rate'] * 100).round(2))
        .sort_index()
    )
    print("Pandas verification:")
    display(family_survival)

Pandas verification:


Unnamed: 0_level_0,Total_Count,Survivors,Survival_Rate,Survival_Rate_Pct
FamilySize,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,533,163,0.3058,30.58
2,161,89,0.5528,55.28
3,102,59,0.5784,57.84
4,29,21,0.7241,72.41
5,15,3,0.2,20.0
6,22,3,0.1364,13.64
7,12,4,0.3333,33.33
8,6,0,0.0,0.0
11,7,0,0.0,0.0


### **Complex Query 5: Fare analysis by class and survival**

In [27]:
# Multi-statistic question: fare mean/min/max per (class, survival) pair.
agent_executor.invoke(
    {
        "input": """For each passenger class, calculate the average, minimum, and maximum fare paid by survivors vs non-survivors.
    Show class, survival status, and fare statistics."""
    }
)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"Age" FLOAT, 
	"Siblings/Spouses Aboard" BIGINT, 
	"Parents/Children Aboard" BIGINT, 
	"Fare" FLOAT
)

/*
3 rows from titanic table:
Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
0	3	Mr. Owen Harris Braund	male	22.0	1	0	7.25
1	1	Mrs. John Bradley (Florence Briggs Thayer) Cumings	female	38.0	1	0	71.2833
1	3	Miss. Laina Heikkinen	female	26.0	0	0	7.925
*/[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"

{'input': 'For each passenger class, calculate the average, minimum, and maximum fare paid by survivors vs non-survivors.\n    Show class, survival status, and fare statistics.',
 'output': 'Here is the average, minimum, and maximum fare paid by survivors vs non-survivors for each passenger class:\n\nClass 1:\n- Non-survivors: Avg Fare = 64.68, Min Fare = 0.0, Max Fare = 263.0\n- Survivors: Avg Fare = 95.61, Min Fare = 25.93, Max Fare = 512.33\n\nClass 2:\n- Non-survivors: Avg Fare = 19.41, Min Fare = 0.0, Max Fare = 73.5\n- Survivors: Avg Fare = 22.06, Min Fare = 10.5, Max Fare = 65.0\n\nClass 3:\n- Non-survivors: Avg Fare = 13.71, Min Fare = 0.0, Max Fare = 69.55\n- Survivors: Avg Fare = 13.69, Min Fare = 0.0, Max Fare = 56.50\n\nLet me know if you need any more details.'}

In [28]:
# Pandas verification
# Fare statistics per (Pclass, Survived) pair, rounded to 2 decimals.
fare_analysis = (
    df.groupby(['Pclass', 'Survived'])
    .agg(
        Avg_Fare=('Fare', 'mean'),
        Min_Fare=('Fare', 'min'),
        Max_Fare=('Fare', 'max'),
        Count=('Fare', 'count'),
    )
    .round(2)
)
# Relabel the first index level so the table header reads "Class".
fare_analysis.index.names = ['Class', 'Survived']
print("Pandas verification:")
display(fare_analysis)

Pandas verification:


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg_Fare,Min_Fare,Max_Fare,Count
Class,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,64.68,0.0,263.0,80
1,1,95.61,25.93,512.33,136
2,0,19.41,0.0,73.5,97
2,1,22.06,10.5,65.0,87
3,0,13.71,0.0,69.55,368
3,1,13.69,0.0,56.5,119


### **Complex Query 6: Top 5 most expensive tickets**

In [29]:
# Ranking question: top five fares with the matching passenger details.
agent_executor.invoke(
    {
        "input": """Find the top 5 most expensive tickets (by Fare) and show passenger details including Name, Sex, Age, 
    Pclass, Fare, and whether they survived. Order by Fare descending."""
    }
)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mtitanic[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"Age" FLOAT, 
	"Siblings/Spouses Aboard" BIGINT, 
	"Parents/Children Aboard" BIGINT, 
	"Fare" FLOAT
)

/*
3 rows from titanic table:
Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
0	3	Mr. Owen Harris Braund	male	22.0	1	0	7.25
1	1	Mrs. John Bradley (Florence Briggs Thayer) Cumings	female	38.0	1	0	71.2833
1	3	Miss. Laina Heikkinen	female	26.0	0	0	7.925
*/[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'titanic'}`


[0m[33;1m[1;3m
CREATE TABLE titanic (
	"Survived" BIGINT, 
	"Pclass" BIGINT, 
	"Name" TEXT, 
	"Sex" TEXT, 
	"

{'input': 'Find the top 5 most expensive tickets (by Fare) and show passenger details including Name, Sex, Age, \n    Pclass, Fare, and whether they survived. Order by Fare descending.',
 'output': 'The top 5 most expensive tickets by Fare and their passenger details are:\n\n1. Name: Miss. Anna Ward, Sex: female, Age: 35, Pclass: 1, Fare: 512.3292, Survived: Yes\n2. Name: Mr. Thomas Drake Martinez Cardeza, Sex: male, Age: 36, Pclass: 1, Fare: 512.3292, Survived: Yes\n3. Name: Mr. Gustave J Lesurer, Sex: male, Age: 35, Pclass: 1, Fare: 512.3292, Survived: Yes\n4. Name: Mr. Charles Alexander Fortune, Sex: male, Age: 19, Pclass: 1, Fare: 263.0, Survived: No\n5. Name: Miss. Mabel Helen Fortune, Sex: female, Age: 23, Pclass: 1, Fare: 263.0, Survived: Yes'}

In [30]:
# Pandas verification
# Take the five rows with the largest Fare, then keep only the columns the
# agent was asked to report.
top_5_expensive = df.nlargest(n=5, columns='Fare').loc[
    :, ['Name', 'Sex', 'Age', 'Pclass', 'Fare', 'Survived']
]
print("Pandas verification - Top 5 most expensive tickets:")
display(top_5_expensive)

Pandas verification - Top 5 most expensive tickets:


Unnamed: 0,Name,Sex,Age,Pclass,Fare,Survived
257,Miss. Anna Ward,female,35.0,1,512.3292,1
676,Mr. Thomas Drake Martinez Cardeza,male,36.0,1,512.3292,1
733,Mr. Gustave J Lesurer,male,35.0,1,512.3292,1
27,Mr. Charles Alexander Fortune,male,19.0,1,263.0,0
87,Miss. Mabel Helen Fortune,female,23.0,1,263.0,1
