In [104]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import hackathon.prompts as prompts
from dotenv import load_dotenv
from pandasai import SmartDataframe
from pandasai.llm import OpenAI
import os


# For the agents
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
import langchain.llms as llms
import time
import json
from flask import Flask, request, jsonify

#For the main OPENAI ASSISTANT
import openai as openai
#
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [8]:
# Constants
# Location of the transactions CSV. The historical absolute path stays as the
# default so existing runs behave the same; set HACKATHON_DATA_PATH (for
# example in .env) to run the notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "HACKATHON_DATA_PATH",
    "/Users/arthur.cruiziat/dev/Qonto_hackathon/data/random_trx_df.csv",
)
# Taken from the environment rather than hard-coded; raises KeyError right
# here if the variable is not set.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Read data

In [117]:
# Load the transaction table from disk.
data = pd.read_csv(DATA_PATH)
# Independent working copy that is later passed to chat(); `data` itself is
# left untouched. (The misspelled name is kept because other cells use it.)
data_filterered = data.copy()

In [173]:
def prompt_to_filter(prompt, data_to_filter):
    """
    Filter a DataFrame from a natural-language instruction using pandasai.

    Args:
    - prompt (str): A prompt describing the filter criteria for the DataFrame.
    - data_to_filter (DataFrame): The DataFrame to be filtered. It is wrapped
      in a SmartDataframe; the result is returned, nothing is assigned to
      notebook-level variables.

    Returns:
    - DataFrame: The answer of ``SmartDataframe.chat(prompt)`` after
      ``convert_dtypes()``.

    Raises:
    - KeyError: If the OPENAI_API_KEY environment variable is not set.
    - RuntimeError: If all 5 attempts fail. The last underlying exception is
      attached as ``__cause__``.

    Any exception raised inside an attempt (including the ``convert_dtypes()``
    call failing on the answer) triggers a retry after a 2 second pause.
    """
    # Looked up outside the retry loop so a missing key fails immediately
    # instead of being retried.
    api_key = os.environ["OPENAI_API_KEY"]

    max_retries = 5
    wait_seconds = 2  # Time to wait between retries

    for attempt in range(1, max_retries + 1):
        try:
            llm = OpenAI(api_key)
            # NOTE(review): confirm that pandasai honours the "llm_model" config
            # key; kept exactly as in the original call.
            smart_df = SmartDataframe(
                data_to_filter, config={"llm": llm, "llm_model": "gpt-4-1106-preview"}
            )
            return smart_df.chat(prompt).convert_dtypes()
        except Exception as e:
            if attempt == max_retries:
                raise RuntimeError(
                    f"Maximum retry attempts reached. Last error: {str(e)}"
                ) from e
            time.sleep(wait_seconds)

In [74]:
# Preview the first two transactions to check the available columns.
data.head(2)

Unnamed: 0,user,date,amount,trx_category,method,team,receipt,VAT,Status
0,user_3,2020-11-19,7,Restaurants,taxes,Marketing,Not needed,Missing,Executed
1,user_2,2021-11-10,977,Health,direct_debit,Sales,Missing,Filled in,Declined


In [118]:
# Smoke-test the filter helper on the full transaction table.
# prompt_to_filter takes the DataFrame as its second positional argument;
# calling it with only the prompt raises TypeError on a fresh kernel.
test = prompt_to_filter(
    "filter the dataframe to only keep the rows where the amount is greater than 900",
    data,
)

In [124]:
# Check which type the filter helper handed back.
type(test.convert_dtypes())

pandas.core.frame.DataFrame

In [66]:
def dataframe_insights(prompt, data=None):
    """
    Answer a natural-language question about a DataFrame with a LangChain
    pandas agent.

    Args:
    - prompt (str): A prompt describing the analysis or question for the DataFrame.
    - data (DataFrame, optional): The DataFrame to analyze. When omitted, the
      CSV at the notebook-level ``DATA_PATH`` is read on every call.

    Returns:
    - str: Whatever ``agent.run(prompt)`` returns.

    Raises:
    - KeyError: If the OPENAI_API_KEY environment variable is not set.
    - RuntimeError: If all 5 attempts fail. The last underlying exception is
      attached as ``__cause__``.

    Any exception raised while building or running the agent triggers a retry
    after a 2 second pause.
    """
    if data is None:
        # Use the notebook-level constant instead of a second hard-coded copy
        # of the path inside this function.
        data = pd.read_csv(DATA_PATH)
    # Looked up outside the retry loop so a missing key fails immediately.
    api_key = os.environ["OPENAI_API_KEY"]

    max_retries = 5
    wait_seconds = 2  # Time to wait between retries

    for attempt in range(1, max_retries + 1):
        try:
            agent = create_pandas_dataframe_agent(
                ChatOpenAI(
                    temperature=0,
                    model="gpt-3.5-turbo-0613",
                    openai_api_key=api_key,
                ),
                data,
                verbose=True,
                agent_type=AgentType.OPENAI_FUNCTIONS,
                handle_parsing_errors=True,
            )
            return agent.run(prompt)
        except Exception as e:
            if attempt == max_retries:
                raise RuntimeError(
                    f"Maximum retry attempts reached. Last error: {str(e)}"
                ) from e
            time.sleep(wait_seconds)

In [67]:
# Same two-row preview of the transaction table as shown earlier.
data.head(2)

Unnamed: 0,user,date,amount,trx_category,method,team,receipt,VAT,Status
0,user_3,2020-11-19,7,Restaurants,taxes,Marketing,Not needed,Missing,Executed
1,user_2,2021-11-10,977,Health,direct_debit,Sales,Missing,Filled in,Declined


In [68]:
# Ask the insights agent a free-form question; the agent is built with
# verbose=True, so its trace is printed before the returned answer.
dataframe_insights("show me the 3 team with the most amounts for the user 3")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df_user3 = df[df['user'] == 'user_3']\ntop_3_teams = df_user3.groupby('team')['amount'].sum().nlargest(3)\ntop_3_teams"}`


[0m[36;1m[1;3mteam
Finance      39462
Marketing    33354
IT           32566
Name: amount, dtype: int64[0m[32;1m[1;3mThe top 3 teams with the most amounts for user 3 are:
1. Finance with a total amount of 39,462
2. Marketing with a total amount of 33,354
3. IT with a total amount of 32,566[0m

[1m> Finished chain.[0m


'The top 3 teams with the most amounts for user 3 are:\n1. Finance with a total amount of 39,462\n2. Marketing with a total amount of 33,354\n3. IT with a total amount of 32,566'

In [45]:
# Two-row preview of the transaction table (repeats the earlier preview cells).
data.head(2)

Unnamed: 0,user,date,amount,trx_category,method,team,receipt,VAT,Status
0,user_3,2020-11-19,7,Restaurants,taxes,Marketing,Not needed,Missing,Executed
1,user_2,2021-11-10,977,Health,direct_debit,Sales,Missing,Filled in,Declined


# Main LLM

In [81]:
import openai as openai

In [151]:
def _prompt_only_tool(name, description, prompt_description):
    """Build a function-tool spec whose single, required argument is a `prompt` string."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    "prompt": {
                        "type": "string",
                        "description": prompt_description,
                    },
                },
                "required": ["prompt"],
            },
        },
    }


# Create or load assistant
def create_assistant(client):
    """
    Return the ID of the assistant used by this notebook, creating it if needed.

    If ``assistant.json`` exists in the working directory, the ID stored under
    its "assistant_id" key is returned and ``client`` is not used. Otherwise a
    new assistant is created through ``client.beta.assistants.create`` with the
    two function tools handled by ``chat`` (``prompt_to_filter`` and
    ``dataframe_insights``), and its ID is written to ``assistant.json``.

    Args:
    - client: OpenAI client exposing ``beta.assistants.create``.

    Returns:
    - str: The assistant ID.

    Note: the cached ID is reused even if the tool definitions below are
    edited afterwards; delete ``assistant.json`` to force a new assistant.
    """
    assistant_file_path = "assistant.json"

    # Load existing assistant if file exists
    if os.path.exists(assistant_file_path):
        with open(assistant_file_path, "r") as file:
            assistant_id = json.load(file)["assistant_id"]
        print("Loaded existing assistant ID.")
        return assistant_id

    # Create a new assistant if file does not exist.
    # Both tools take only a `prompt` string. The DataFrame to filter is supplied
    # locally by chat(), so it is not part of the schema: the former
    # "pandas.DataFrame" entry was not a JSON Schema type and the model cannot
    # send a DataFrame anyway.
    assistant = client.beta.assistants.create(
        instructions=prompts.assistant_instructions,
        model="gpt-4-1106-preview",
        tools=[
            _prompt_only_tool(
                "prompt_to_filter",
                "Applies a given prompt to filter a DataFrame using OpenAI's GPT model.",
                "A prompt describing the filter criteria for the DataFrame.",
            ),
            _prompt_only_tool(
                "dataframe_insights",
                "Analyzes a given DataFrame with a specific prompt using OpenAI's GPT model.",
                "A prompt describing the analysis or question for the DataFrame.",
            ),
        ],
    )

    # Save assistant ID to file for future runs
    with open(assistant_file_path, "w") as file:
        json.dump({"assistant_id": assistant.id}, file)
    print("Created a new assistant and saved the ID.")

    return assistant.id

In [84]:
# Notebook-level OpenAI client; start_conversation() and chat() read this
# global rather than taking a client argument.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [100]:
# Reuse the assistant ID cached in assistant.json, or create and cache a new
# assistant; chat() reads this notebook-level `assistant_id`.
assistant_id = create_assistant(client)

Created a new assistant and saved the ID.


In [111]:
def start_conversation():
    """
    Open a fresh assistant thread using the notebook-level ``client``.

    Prints progress messages and returns a dict of the form
    ``{"thread_id": <id of the new thread>}``.
    """
    print("Starting a new conversation...")
    new_thread_id = client.beta.threads.create().id
    print(f"New thread created with ID: {new_thread_id}")
    return dict(thread_id=new_thread_id)

In [170]:
def _execute_tool_call(tool_call, data_filterered):
    """Run one assistant tool call locally.

    Returns ``(output_text, filtered_frame)``; ``filtered_frame`` is None unless
    the call was ``prompt_to_filter``.
    """
    tool_name = tool_call.function.name
    if tool_name == "dataframe_insights":
        arguments = json.loads(tool_call.function.arguments)
        return json.dumps(dataframe_insights(arguments["prompt"])), None
    if tool_name == "prompt_to_filter":
        arguments = json.loads(tool_call.function.arguments)
        filtered = prompt_to_filter(arguments["prompt"], data_filterered)
        print(filtered)
        # to_json() already returns a JSON string; encoding it again would
        # send the assistant an escaped string literal.
        return filtered.to_json(), filtered
    # Every requested call needs an answer, otherwise the run stays in
    # "requires_action".
    return json.dumps({"error": f"Unknown tool: {tool_name}"}), None


def chat(thread_id, user_input, data_filterered):
    """
    Send one user message to the assistant and return its reply.

    Uses the notebook-level ``client`` and ``assistant_id``. While the run is
    in progress, tool calls requested by the assistant are executed locally
    (``dataframe_insights`` / ``prompt_to_filter``) and their outputs are
    submitted back to the run.

    Args:
    - thread_id (str): ID of the thread to post the message to.
    - user_input (str): The user's message.
    - data_filterered (DataFrame): Frame handed to ``prompt_to_filter`` when
      the assistant asks for a filter.

    Returns:
    - dict: ``{"response": <text of the first listed message>,
      "data_filterered": <frame>}``. The frame is the result of the last
      ``prompt_to_filter`` call, or the input frame unchanged when no filter
      was requested.
    - tuple: ``({"error": "Missing thread_id"}, 400)`` when ``thread_id`` is
      empty.

    Raises:
    - RuntimeError: If the run ends as "failed", "cancelled" or "expired".
    """
    if not thread_id:
        print("Error: Missing thread_id")
        # A plain dict instead of jsonify(): jsonify needs a Flask application
        # context, which does not exist when this runs in a notebook.
        return {"error": "Missing thread_id"}, 400

    print(f"Received message: {user_input} for thread ID: {thread_id}")

    # Add the user's message to the thread
    client.beta.threads.messages.create(
        thread_id=thread_id, role="user", content=user_input
    )

    # Run the Assistant
    run = client.beta.threads.runs.create(
        thread_id=thread_id, assistant_id=assistant_id
    )

    # Default result when the assistant never asks for a filter; without it the
    # final return would hit an unbound local.
    filtered_dataframe = data_filterered

    # Poll until the run finishes, serving tool calls along the way
    while True:
        run_status = client.beta.threads.runs.retrieve(
            thread_id=thread_id, run_id=run.id
        )
        if run_status.status == "completed":
            break
        if run_status.status in ("failed", "cancelled", "expired"):
            # These statuses never turn into "completed"; stop instead of
            # polling forever.
            raise RuntimeError(
                f"Assistant run ended with status '{run_status.status}': "
                f"{getattr(run_status, 'last_error', None)}"
            )
        if run_status.status == "requires_action":
            # Collect every output first and submit them in one request.
            tool_outputs = []
            for tool_call in run_status.required_action.submit_tool_outputs.tool_calls:
                output, filtered = _execute_tool_call(tool_call, data_filterered)
                if filtered is not None:
                    filtered_dataframe = filtered
                tool_outputs.append({"tool_call_id": tool_call.id, "output": output})
            client.beta.threads.runs.submit_tool_outputs(
                thread_id=thread_id,
                run_id=run.id,
                tool_outputs=tool_outputs,
            )
        # Wait between polls for every non-final status, not only after
        # handling tool calls.
        time.sleep(1)

    # Retrieve and return the latest message from the assistant
    # NOTE(review): relies on the list call returning the newest message first.
    messages = client.beta.threads.messages.list(thread_id=thread_id)
    response = messages.data[0].content[0].text.value

    print(f"Assistant response: {response}")
    return {"response": response, "data_filterered": filtered_dataframe}

In [174]:
# Open a new assistant thread and show its ID.
thread = start_conversation()
thread["thread_id"]

Starting a new conversation...
New thread created with ID: thread_3ROE0kjAGABFuOVMzLdL0N7d


'thread_3ROE0kjAGABFuOVMzLdL0N7d'

In [175]:
# Send one user message through the assistant, passing the working copy of
# the transactions as the frame to filter.
response = chat(
    thread["thread_id"],
    "Show me the transactions with only the user 3",
    data_filterered,
)

Received message: Show me the transactions with only the user 3 for thread ID: thread_3ROE0kjAGABFuOVMzLdL0N7d
       user        date  amount trx_category        method       team  \
0    user_3  2020-11-19       7  Restaurants         taxes  Marketing   
3    user_3  2020-11-10      57       Mobile  direct_debit    Finance   
6    user_3  2021-07-30     302        Sport  direct_debit         IT   
11   user_3  2020-07-15     431        Hotel         taxes      Sales   
12   user_3  2021-03-30     106         Fees      transfer      Sales   
..      ...         ...     ...          ...           ...        ...   
978  user_3  2020-07-26     538        Hotel      transfer      Sales   
983  user_3  2020-09-03     207       Health  direct_debit      Sales   
985  user_3  2020-03-22     902         Fees      transfer    Finance   
997  user_3  2020-12-30     401        Train          card    Finance   
998  user_3  2020-06-07     850          ATM  direct_debit    Finance   

        rece

In [177]:
# The frame chat() returned under the "data_filterered" key.
response["data_filterered"]

Unnamed: 0,user,date,amount,trx_category,method,team,receipt,VAT,Status
0,user_3,2020-11-19,7,Restaurants,taxes,Marketing,Not needed,Missing,Executed
3,user_3,2020-11-10,57,Mobile,direct_debit,Finance,Not needed,Missing,Executed
6,user_3,2021-07-30,302,Sport,direct_debit,IT,Missing,Filled in,Processing
11,user_3,2020-07-15,431,Hotel,taxes,Sales,Missing,Missing,Declined
12,user_3,2021-03-30,106,Fees,transfer,Sales,Missing,Missing,Declined
...,...,...,...,...,...,...,...,...,...
978,user_3,2020-07-26,538,Hotel,transfer,Sales,Attached,Missing,Declined
983,user_3,2020-09-03,207,Health,direct_debit,Sales,Missing,Missing,Processing
985,user_3,2020-03-22,902,Fees,transfer,Finance,Attached,Filled in,Processing
997,user_3,2020-12-30,401,Train,card,Finance,Missing,Missing,Executed


In [164]:
# The notebook-level working copy; chat() returns its result in the response
# dict and does not rebind this name.
data_filterered

Unnamed: 0,user,date,amount,trx_category,method,team,receipt,VAT,Status
0,user_3,2020-11-19,7,Restaurants,taxes,Marketing,Not needed,Missing,Executed
1,user_2,2021-11-10,977,Health,direct_debit,Sales,Missing,Filled in,Declined
2,user_2,2021-03-04,130,Travel,card,Marketing,Not needed,Missing,Executed
3,user_3,2020-11-10,57,Mobile,direct_debit,Finance,Not needed,Missing,Executed
4,user_1,2020-01-20,838,Travel,card,Finance,Attached,Missing,Processing
...,...,...,...,...,...,...,...,...,...
995,user_1,2020-09-20,94,Food & Grocery,transfer,Marketing,Missing,Missing,Processing
996,user_2,2020-11-04,931,Taxi,transfer,Finance,Not needed,Filled in,Executed
997,user_3,2020-12-30,401,Train,card,Finance,Missing,Missing,Executed
998,user_3,2020-06-07,850,ATM,direct_debit,Finance,Missing,Filled in,Executed
