In [None]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever pip is first on the PATH.
%pip install llama-index

In [None]:
# Use the %pip magic rather than !pip so the Anthropic LLM integration is
# installed into the environment of the running kernel.
%pip install llama-index-llms-anthropic

In [3]:
import logging
import os
import sys
from getpass import getpass

import pandas as pd
from IPython.display import Markdown, display
from llama_index.core.query_engine import PandasQueryEngine


# Send INFO-and-above log records to stdout so they show up in the cell output.
# basicConfig already attaches a stdout StreamHandler to the root logger when
# none is configured; adding a second StreamHandler on the same stream would
# emit every record twice, so only basicConfig is used here.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [None]:
# Never paste a real key into the notebook: it would be saved with the file.
# Read it from the ANTHROPIC_API_KEY environment variable, and only fall back
# to an interactive, non-echoing prompt when the variable is unset or empty.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY") or getpass(
    "Anthropic API key: "
)

In [5]:
from llama_index.llms.anthropic import Anthropic

In [6]:
# Build the Anthropic LLM client: temperature 0.0, fixed model id, and the
# key held in ANTHROPIC_API_KEY.
llm = Anthropic(
    model="claude-3-opus-20240229",
    temperature=0.0,
    api_key=ANTHROPIC_API_KEY,
)

In [7]:
# Download the Titanic training CSV and save it as titanic_train.csv in the
# current working directory.
!wget -O 'titanic_train.csv' 'https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/examples/data/csv/titanic_train.csv'

--2024-03-15 18:18:52--  https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/examples/data/csv/titanic_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57726 (56K) [text/plain]
Saving to: ‘titanic_train.csv’


2024-03-15 18:18:52 (5.22 MB/s) - ‘titanic_train.csv’ saved [57726/57726]



In [8]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./titanic_train.csv")

In [9]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# (rows, columns) of the loaded frame.
df.shape

(891, 11)

In [21]:
# Count passengers per value of the 'sex' column (baseline for the LLM answer).
df.loc[:, "sex"].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [17]:
# Number of missing values in each column (baseline for the LLM answer).
df.isna().sum()

survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
ticket        0
fare          0
cabin       687
embarked      2
dtype: int64

In [11]:
# Natural-language query engine over the DataFrame, backed by the LLM.
# verbose=True makes it print the pandas instructions it generated and their
# output for each query.
# NOTE(review): the engine runs LLM-generated pandas code against `df`;
# confirm that is acceptable before pointing it at untrusted prompts or data.
query_engine = PandasQueryEngine(
    df=df,
    llm=llm,
    verbose=True,
)

In [12]:
# Ask for the survival/age correlation; the answer is kept in `response`.
response = query_engine.query("What is the correlation between survival and age?")

> Pandas Instructions:
```
df['survived'].corr(df['age'])
```
> Pandas Output: -0.07722109457217768


In [13]:
# Render the latest answer in bold through IPython's Markdown display.
display(
    Markdown(f"<b>{response}</b>")
)

<b>-0.07722109457217768</b>

In [14]:
# Ask for the number of rows; the answer is kept in `response`.
response = query_engine.query("How many records are there in the dataset?")

> Pandas Instructions:
```
len(df)
```
> Pandas Output: 891


In [15]:
# Render the latest answer in bold through IPython's Markdown display.
display(
    Markdown(f"<b>{response}</b>")
)

<b>891</b>

In [18]:
# Ask for per-column null counts; compare with the df.isnull().sum() cell.
response = query_engine.query("calculate the null values in each feature in the dataset?")

> Pandas Instructions:
```
df.isnull().sum()
```
> Pandas Output: survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
ticket        0
fare          0
cabin       687
embarked      2
dtype: int64


In [22]:
# Ask for the male/female split; compare with the value_counts() cell.
response = query_engine.query("How many male and female are there in the dataset?")

> Pandas Instructions:
```
df.groupby('sex').size()
```
> Pandas Output: sex
female    314
male      577
dtype: int64
