diff --git a/README.md b/README.md index 05e3935cf..69f3bc5ce 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,38 @@ pandas_ai( ![Chart](images/histogram-chart.png?raw=true) +You can also pass multiple dataframes to PandasAI and ask questions that relate them. + +```python +import pandas as pd +from pandasai import PandasAI +from pandasai.llm.openai import OpenAI + +employees_data = { + 'EmployeeID': [1, 2, 3, 4, 5], + 'Name': ['John', 'Emma', 'Liam', 'Olivia', 'William'], + 'Department': ['HR', 'Sales', 'IT', 'Marketing', 'Finance'] +} + +salaries_data = { + 'EmployeeID': [1, 2, 3, 4, 5], + 'Salary': [5000, 6000, 4500, 7000, 5500] +} + +employees_df = pd.DataFrame(employees_data) +salaries_df = pd.DataFrame(salaries_data) + +llm = OpenAI() +pandas_ai = PandasAI(llm) +pandas_ai([employees_df, salaries_df], "Who gets paid the most?") +``` + +The above code will return the following: + +``` +Oh, Olivia gets paid the most. +``` + You can find more examples in the [examples](examples) directory. ## Command-Line Tool @@ -103,7 +135,7 @@ Options: - **-d, --dataset**: The file path to the dataset. - **-t, --token**: Your HuggingFace or OpenAI API token, if no token provided pai will pull from the `.env` file. -- **-m, --model**: Choice of LLM, either `openai`, `open-assistant`, or `starcoder`. +- **-m, --model**: Choice of LLM, one of `openai`, `open-assistant`, `starcoder`, or Google `palm`. - **-p, --prompt**: Prompt that PandasAI will run. To view a full list of available options and their descriptions, run the following command: diff --git a/docs/getting-started.md b/docs/getting-started.md index bd313d9c1..ce1188ba9 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -1,23 +1,22 @@ -Usage -===== -`pandasai` is developed on top of `pandas` api. The objective is to make dataframe conversation +# Usage + +`pandasai` is developed on top of the `pandas` API. The objective is to make dataframes conversational using Large Language Models (LLMs). 
-Installation ------------- +## Installation -To use pandasai, first install it using pip through [PyPi](https://pypi.org/project/pandasai/) package distribution +To use pandasai, first install it using pip through [PyPI](https://pypi.org/project/pandasai/) package distribution framework. It is actively developed so be vigilant for versions updates. ```console pip install pandasai ``` ->It is recommended to create a Virtual environment using your preffred choice of Environment Managers e.g conda, ->Poetry etc +> It is recommended to create a virtual environment using your preferred environment manager, e.g. conda, +> Poetry, etc. + +## Getting Started -Getting Started ---------------- Below is simple example to get started with `pandasai`. ```python @@ -41,16 +40,15 @@ pandas_ai.run(df, prompt='Which are the 5 happiest countries?') ## Generate openai API Token -Users are required to generate `YOUR_API_TOKEN`. Follow below simple steps to generate your API_TOKEN with +Users are required to generate `YOUR_API_TOKEN`. Follow the simple steps below to generate your API_TOKEN with [openai](https://platform.openai.com/overview). -1. Go to https://openai.com/api/ and signup with your email address or connect your Google Account. +1. Go to https://openai.com/api/ and sign up with your email address or connect your Google Account. 2. Go to View API Keys on left side of your Personal Account Settings 3. Select Create new Secret key -> The API access to openai is a paid service. You have to set up billing. ->Read the [Pricing](https://platform.openai.com/docs/quickstart/pricing) information before experimenting. - +> The API access to openai is a paid service. You have to set up billing. +> Read the [Pricing](https://platform.openai.com/docs/quickstart/pricing) information before experimenting. 
## Demo in Google Colab @@ -58,7 +56,8 @@ Try out PandasAI in your browser: [![Open in Colab](https://camo.githubusercontent.com/84f0493939e0c4de4e6dbe113251b4bfb5353e57134ffd9fcab6b8714514d4d1/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/drive/1rKz7TudOeCeKGHekw7JFNL4sagN9hon-?usp=sharing) -### Examples +### Examples + Other [examples](../examples) are included in the repository along with samples of data. #### Working with CSV @@ -81,7 +80,9 @@ print(response) ``` #### Working is Pandas Dataframe + Example of using PandasAI with a Pandas DataFrame + ```python import pandas as pd from data.sample_dataframe import dataframe @@ -99,8 +100,10 @@ print(response) ``` -#### Plotting +#### Plotting + Example of using PandasAI to generate a chart from a Pandas DataFrame + ```python import pandas as pd from data.sample_dataframe import dataframe @@ -118,4 +121,38 @@ response = pandas_ai.run( ) print(response) # Output: check out images/histogram-chart.png -``` \ No newline at end of file +``` + +#### Working with multiple dataframes + +Example of using PandasAI with multiple Pandas DataFrames + +```python +import pandas as pd + +from pandasai import PandasAI +from pandasai.llm.openai import OpenAI + +employees_data = { + 'EmployeeID': [1, 2, 3, 4, 5], + 'Name': ['John', 'Emma', 'Liam', 'Olivia', 'William'], + 'Department': ['HR', 'Sales', 'IT', 'Marketing', 'Finance'] +} + +salaries_data = { + 'EmployeeID': [1, 2, 3, 4, 5], + 'Salary': [5000, 6000, 4500, 7000, 5500] +} + +employees_df = pd.DataFrame(employees_data) +salaries_df = pd.DataFrame(salaries_data) + +llm = OpenAI() +pandas_ai = PandasAI(llm, verbose=True, conversational=False) +response = pandas_ai.run( + [employees_df, salaries_df], + "Who gets paid the most?", +) +print(response) +# Output: Olivia gets paid the most. 
+``` diff --git a/examples/with_multiple_dataframes.py b/examples/with_multiple_dataframes.py new file mode 100644 index 000000000..6b4653682 --- /dev/null +++ b/examples/with_multiple_dataframes.py @@ -0,0 +1,26 @@ +"""Example of using PandasAI on multiple Pandas DataFrame""" + +import pandas as pd +from pandasai import PandasAI +from pandasai.llm.openai import OpenAI + +employees_data = { + 'EmployeeID': [1, 2, 3, 4, 5], + 'Name': ['John', 'Emma', 'Liam', 'Olivia', 'William'], + 'Department': ['HR', 'Sales', 'IT', 'Marketing', 'Finance'] +} + +salaries_data = { + 'EmployeeID': [1, 2, 3, 4, 5], + 'Salary': [5000, 6000, 4500, 7000, 5500] +} + +employees_df = pd.DataFrame(employees_data) +salaries_df = pd.DataFrame(salaries_data) + + +llm = OpenAI() +pandas_ai = PandasAI(llm, verbose=True) +response = pandas_ai([employees_df, salaries_df], "Who gets paid the most?") +print(response) +# Output: Olivia diff --git a/pandasai/__init__.py b/pandasai/__init__.py index b6c09da4e..dbe399931 100644 --- a/pandasai/__init__.py +++ b/pandasai/__init__.py @@ -17,7 +17,8 @@ from .prompts.correct_error_prompt import CorrectErrorPrompt from .prompts.generate_python_code import GeneratePythonCodePrompt from .prompts.generate_response import GenerateResponsePrompt - +from .prompts.multiple_dataframes import MultipleDataframesPrompt +from .prompts.correct_multiples_prompt import CorrectMultipleDataframesErrorPrompt # pylint: disable=too-many-instance-attributes disable=too-many-arguments class PandasAI: @@ -85,27 +86,51 @@ def run( try: rows_to_display = 0 if self._enforce_privacy else 5 - df_head = data_frame.head(rows_to_display) - if anonymize_df: - df_head = anonymize_dataframe_head(df_head) - - code = self._llm.generate_code( - GeneratePythonCodePrompt( - prompt=prompt, - df_head=df_head, - num_rows=data_frame.shape[0], - num_columns=data_frame.shape[1], - rows_to_display=rows_to_display, - ), - prompt, - ) - self._original_instructions = { - "question": prompt, - "df_head": 
df_head, - "num_rows": data_frame.shape[0], - "num_columns": data_frame.shape[1], - "rows_to_display": rows_to_display, - } + multiple: bool = isinstance(data_frame, list) + + if multiple: + + heads = [anonymize_dataframe_head(dataframe) + if anonymize_df + else dataframe.head(rows_to_display) + for dataframe in data_frame] + + code = self._llm.generate_code( + MultipleDataframesPrompt(dataframes=heads), + prompt, + ) + + self._original_instructions = { + "question": prompt, + "df_head": heads, + "rows_to_display": rows_to_display, + } + + else: + + df_head = data_frame.head(rows_to_display) + if anonymize_df: + df_head = anonymize_dataframe_head(df_head) + + code = self._llm.generate_code( + GeneratePythonCodePrompt( + prompt=prompt, + df_head=df_head, + num_rows=data_frame.shape[0], + num_columns=data_frame.shape[1], + rows_to_display=rows_to_display, + ), + prompt, + ) + + self._original_instructions = { + "question": prompt, + "df_head": df_head, + "num_rows": data_frame.shape[0], + "num_columns": data_frame.shape[1], + "rows_to_display": rows_to_display, + } + self.last_code_generated = code self.log( f""" @@ -171,7 +196,7 @@ def is_df_overwrite(self, node: ast.stmt) -> str: return ( isinstance(node, ast.Assign) and isinstance(node.targets[0], ast.Name) - and node.targets[0].id == "df" + and re.match(r"df\d{0,2}$", node.targets[0].id) ) def clean_code(self, code: str) -> str: @@ -197,6 +222,7 @@ def run_code( # pylint: disable=W0122 disable=W0123 disable=W0702:bare-except """Run the code in the current context and return the result""" + multiple: bool = isinstance(data_frame, list) # Get the code to run removing unsafe imports and df overwrites code_to_run = self.clean_code(code) self.last_run_code = code_to_run @@ -208,26 +234,32 @@ def run_code( ```""" ) + environment: dict = { + "pd": pd, + "plt": plt, + "__builtins__": { + **{ + builtin: __builtins__[builtin] + for builtin in WHITELISTED_BUILTINS + }, + }, + } + + if multiple: + environment.update({ + 
f"df{i}": dataframe for i, dataframe in enumerate(data_frame, start = 1) + }) + + else: + environment["df"] = data_frame + # Redirect standard output to a StringIO buffer with redirect_stdout(io.StringIO()) as output: count = 0 while count < self._max_retries: try: # Execute the code - exec( - code_to_run, - { - "pd": pd, - "df": data_frame, - "plt": plt, - "__builtins__": { - **{ - builtin: __builtins__[builtin] - for builtin in WHITELISTED_BUILTINS - }, - }, - }, - ) + exec(code_to_run, environment) code = code_to_run break except Exception as e: # pylint: disable=W0718 disable=C0103 @@ -235,15 +267,26 @@ def run_code( raise e count += 1 - error_correcting_instruction = CorrectErrorPrompt( - code=code, - error_returned=e, - question=self._original_instructions["question"], - df_head=self._original_instructions["df_head"], - num_rows=self._original_instructions["num_rows"], - num_columns=self._original_instructions["num_columns"], - rows_to_display=self._original_instructions["rows_to_display"], - ) + + if multiple: + error_correcting_instruction = CorrectMultipleDataframesErrorPrompt( + code=code, + error_returned=e, + question=self._original_instructions["question"], + df_head=self._original_instructions["df_head"], + ) + + else: + error_correcting_instruction = CorrectErrorPrompt( + code=code, + error_returned=e, + question=self._original_instructions["question"], + df_head=self._original_instructions["df_head"], + num_rows=self._original_instructions["num_rows"], + num_columns=self._original_instructions["num_columns"], + rows_to_display=self._original_instructions["rows_to_display"], + ) + code_to_run = self._llm.generate_code( error_correcting_instruction, "" ) @@ -254,25 +297,12 @@ def run_code( lines = code.strip().split("\n") last_line = lines[-1].strip() - pattern = r"^print\((.*)\)$" - match = re.match(pattern, last_line) + match = re.match(r"^print\((.*)\)$", last_line) if match: last_line = match.group(1) try: - return eval( - last_line, - { - "pd": 
pd, - "df": data_frame, - "__builtins__": { - **{ - builtin: __builtins__[builtin] - for builtin in WHITELISTED_BUILTINS - }, - }, - }, - ) + return eval(last_line, environment) except Exception: # pylint: disable=W0718 return captured_output diff --git a/pandasai/prompts/correct_multiples_prompt.py b/pandasai/prompts/correct_multiples_prompt.py new file mode 100644 index 000000000..fe12ba3d4 --- /dev/null +++ b/pandasai/prompts/correct_multiples_prompt.py @@ -0,0 +1,47 @@ +""" Prompt to correct error """ + +import pandas as pd + +from pandasai.constants import END_CODE_TAG, START_CODE_TAG + +from .base import Prompt + + +class CorrectMultipleDataframesErrorPrompt(Prompt): + """Prompt to generate Python code""" + + text: str = """ +You are provided with the following pandas dataframes:""" + + def __init__( + self, + code: str, + error_returned: Exception, + question: str, + df_head: list[pd.DataFrame], + ): + for i, dataframe in enumerate(df_head, start=1): + row, col = dataframe.shape + self.text += f""" +Dataframe df{i}, with {row} rows and {col} columns. +This is the result of `print(df{i}.head())`: +{dataframe}""" + + instruction: str = f""" +The user asked the following question: +{question} + +You generated this python code: +{code} + +It fails with the following error: +{error_returned} + +Correct the python code and return a new python code (do not import anything) that fixes the above mentioned error. Do not generate the same code again. +Make sure to prefix the requested python code with {START_CODE_TAG} exactly and suffix the code with {END_CODE_TAG} exactly. 
+""" + + self.text += instruction + + def __str__(self): + return self.text diff --git a/pandasai/prompts/multiple_dataframes.py b/pandasai/prompts/multiple_dataframes.py new file mode 100644 index 000000000..2a357a6a2 --- /dev/null +++ b/pandasai/prompts/multiple_dataframes.py @@ -0,0 +1,40 @@ +""" Prompt to generate Python code for multiple dataframes """ + +from datetime import date + +import pandas as pd + +from pandasai.constants import END_CODE_TAG, START_CODE_TAG + +from .base import Prompt + + +class MultipleDataframesPrompt(Prompt): + """Prompt to generate Python code""" + + text: str = """ +Today is {today_date}. +You are provided with the following pandas dataframes:""" + instruction: str = """ +When asked about the data, your response should include a python code that describes the dataframes provided. +Using the provided dataframes and no other dataframes, return the python code and make sure to prefix the requested python code with {START_CODE_TAG} exactly and suffix the code with {END_CODE_TAG} exactly to get the answer to the following question: +""" + + def __init__(self, dataframes: list[pd.DataFrame]): + for i, dataframe in enumerate(dataframes, start=1): + row, col = dataframe.shape + + self.text += f""" +Dataframe df{i}, with {row} rows and {col} columns. 
+This is the result of `print(df{i}.head())`: +{dataframe}""" + + self.text += self.instruction + self.text = self.text.format( + today_date=date.today(), + START_CODE_TAG=START_CODE_TAG, + END_CODE_TAG=END_CODE_TAG, + ) + + def __str__(self): + return self.text diff --git a/tests/prompts/test_correct_multiples_prompt.py b/tests/prompts/test_correct_multiples_prompt.py new file mode 100644 index 000000000..20a87e893 --- /dev/null +++ b/tests/prompts/test_correct_multiples_prompt.py @@ -0,0 +1,65 @@ +"""Unit tests for the correct multiples prompt class""" + +from datetime import date + +import pandas as pd +import pytest + +from pandasai.prompts.correct_multiples_prompt import ( + CorrectMultipleDataframesErrorPrompt, +) + + +class TestCorrectMultipleDataframesErrorPrompt: + """Unit tests for the correct multiples prompt class""" + + def test_str_with_args(self): + """Test that the __str__ method is implemented""" + + df1 = pd.DataFrame( + {"A": [10, 20, 30, 40, 50], "B": [1, 2, 3, 4, 5], "C": [2, 3, 4, 5, 6]} + ) + df2 = pd.DataFrame( + {"A": [10, 20, 30, 40, 50], "B": [1, 2, 3, 4, 5], "C": [2, 3, 4, 5, 6]} + ) + + assert ( + str( + CorrectMultipleDataframesErrorPrompt( + code="code", + error_returned=Exception("error"), + question="question", + df_head=[df1, df2], + ) + ) + == """ +You are provided with the following pandas dataframes: +Dataframe df1, with 5 rows and 3 columns. +This is the result of `print(df1.head())`: + A B C +0 10 1 2 +1 20 2 3 +2 30 3 4 +3 40 4 5 +4 50 5 6 +Dataframe df2, with 5 rows and 3 columns. +This is the result of `print(df2.head())`: + A B C +0 10 1 2 +1 20 2 3 +2 30 3 4 +3 40 4 5 +4 50 5 6 +The user asked the following question: +question + +You generated this python code: +code + +It fails with the following error: +error + +Correct the python code and return a new python code (do not import anything) that fixes the above mentioned error. Do not generate the same code again. 
+Make sure to prefix the requested python code with exactly and suffix the code with exactly. +""" + ) diff --git a/tests/prompts/test_generate_python_code_prompt.py b/tests/prompts/test_generate_python_code_prompt.py new file mode 100644 index 000000000..72c5e3906 --- /dev/null +++ b/tests/prompts/test_generate_python_code_prompt.py @@ -0,0 +1,33 @@ +"""Unit tests for the generate python code prompt class""" + +from datetime import date + +import pytest + +from pandasai.prompts.generate_python_code import GeneratePythonCodePrompt + + +class TestGeneratePythonCodePrompt: + """Unit tests for the generate python code prompt class""" + + def test_str_with_args(self): + """Test that the __str__ method is implemented""" + assert ( + str( + GeneratePythonCodePrompt( + df_head="df.head()", + num_rows=10, + num_columns=5, + rows_to_display=5, + ) + ) + == f""" +Today is {date.today()}. +You are provided with a pandas dataframe (df) with 10 rows and 5 columns. +This is the result of `print(df.head(5))`: +df.head(). + +When asked about the data, your response should include a python code that describes the dataframe `df`. 
+Using the provided dataframe, df, return the python code and make sure to prefix the requested python code with exactly and suffix the code with exactly to get the answer to the following question: +""" + ) diff --git a/tests/prompts/test_multiple_dataframes_prompt.py b/tests/prompts/test_multiple_dataframes_prompt.py new file mode 100644 index 000000000..897b47f7c --- /dev/null +++ b/tests/prompts/test_multiple_dataframes_prompt.py @@ -0,0 +1,48 @@ +"""Unit tests for the multiple dataframes prompt class""" + +from datetime import date + +import pandas as pd +import pytest + +from pandasai.prompts.multiple_dataframes import MultipleDataframesPrompt + + +class TestMultipleDataframesPrompt: + """Unit tests for the multiple dataframes prompt class""" + + def test_str_with_args(self): + """Test that the __str__ method is implemented""" + + df1 = pd.DataFrame( + {"A": [10, 20, 30, 40, 50], "B": [1, 2, 3, 4, 5], "C": [2, 3, 4, 5, 6]} + ) + df2 = pd.DataFrame( + {"A": [10, 20, 30, 40, 50], "B": [1, 2, 3, 4, 5], "C": [2, 3, 4, 5, 6]} + ) + + assert ( + str(MultipleDataframesPrompt(dataframes=[df1, df2])) + == f""" +Today is {date.today()}. +You are provided with the following pandas dataframes: +Dataframe df1, with 5 rows and 3 columns. +This is the result of `print(df1.head())`: + A B C +0 10 1 2 +1 20 2 3 +2 30 3 4 +3 40 4 5 +4 50 5 6 +Dataframe df2, with 5 rows and 3 columns. +This is the result of `print(df2.head())`: + A B C +0 10 1 2 +1 20 2 3 +2 30 3 4 +3 40 4 5 +4 50 5 6 +When asked about the data, your response should include a python code that describes the dataframes provided. +Using the provided dataframes and no other dataframes, return the python code and make sure to prefix the requested python code with exactly and suffix the code with exactly to get the answer to the following question: +""" + )