In [266]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import hackathon.prompts as prompts
from dotenv import load_dotenv
from pandasai import SmartDataframe
from pandasai.llm import OpenAI
import os
import math


# For the agents
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
import langchain.llms as llms
import time
import json
from flask import Flask, request, jsonify

#For the main OPENAI ASSISTANT
import openai as openai
#
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [8]:
# Constants
# Location of the transactions CSV. Set the TRX_DATA_PATH environment variable
# to run the notebook on another machine; the default is the original local path.
DATA_PATH = os.environ.get(
    "TRX_DATA_PATH",
    "/Users/arthur.cruiziat/dev/Qonto_hackathon/data/random_trx_df.csv",
)
# Raises KeyError when the variable is not set
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Read data

In [117]:
# Load the transaction data from the CSV at DATA_PATH
data = pd.read_csv(DATA_PATH)
# Independent copy; it is narrowed with .query(...) before being handed to chat() further down
data_filterered = data.copy()

In [267]:
def create_temporal_features(df, date_column):
    """
    Returns a copy of the DataFrame with temporal feature columns added.

    The input DataFrame is left untouched; all columns are written to a copy.

    Parameters:
    df (DataFrame): The input DataFrame with a date or datetime column.
    date_column (str): The name of the column containing the date or datetime.

    Returns:
    DataFrame: A new DataFrame in which `date_column` is converted to datetime
    and the columns day_of_week, day_of_year, week_of_year, week_of_month,
    is_weekday, month and year have been added.

    Raises:
    KeyError: If `date_column` is not a column of `df`.
    """

    # Work on a copy so the caller's frame does not silently gain columns
    df = df.copy()

    # Ensure the date column is in datetime format
    dates = pd.to_datetime(df[date_column])
    df[date_column] = dates

    # Day of the week (0 = Monday, 6 = Sunday)
    df["day_of_week"] = dates.dt.dayofweek

    # Day of the year
    df["day_of_year"] = dates.dt.dayofyear

    # ISO week of the year
    df["week_of_year"] = dates.dt.isocalendar().week

    # Week of the month: days 1-7 -> 1, 8-14 -> 2, ... (same as ceil(day / 7)),
    # computed on the whole column instead of row by row
    df["week_of_month"] = (dates.dt.day - 1) // 7 + 1

    # Weekday flag (True for Monday-Friday, False for Saturday/Sunday)
    df["is_weekday"] = dates.dt.dayofweek < 5

    # Month
    df["month"] = dates.dt.month

    # Year
    df["year"] = dates.dt.year

    return df

In [268]:
# Build the temporal features on a copy so that `data` itself is not touched
test = create_temporal_features(data.copy(), "date")

In [269]:
test.head()

Unnamed: 0,user,date,amount,trx_category,method,team,receipt,VAT,Status,day_of_week,day_of_year,week_of_year,week_of_month,is_weekday,month,year
0,user_3,2020-11-19,7,Restaurants,taxes,Marketing,Not needed,Missing,Executed,3,324,47,3,True,11,2020
1,user_2,2021-11-10,977,Health,direct_debit,Sales,Missing,Filled in,Declined,2,314,45,2,True,11,2021
2,user_2,2021-03-04,130,Travel,card,Marketing,Not needed,Missing,Executed,3,63,9,1,True,3,2021
3,user_3,2020-11-10,57,Mobile,direct_debit,Finance,Not needed,Missing,Executed,1,315,46,2,True,11,2020
4,user_1,2020-01-20,838,Travel,card,Finance,Attached,Missing,Processing,0,20,4,3,True,1,2020


In [246]:
def prompt_to_filter(prompt, data_to_filter):
    """
    Applies a natural-language prompt to a transaction DataFrame through pandasai.

    Temporal feature columns are added to a copy of the input before the prompt
    is sent, so the caller's DataFrame is not modified.

    Args:
    - prompt (str): A prompt describing the filter criteria for the DataFrame.
    - data_to_filter (DataFrame): The DataFrame to be filtered. It must contain
      a "date" column.

    Returns:
    - The result of `SmartDataframe.chat(prompt).convert_dtypes()` on success.
    - str: A fixed fallback message (meant to be relayed by the assistant) when
      all 5 attempts failed.

    Raises:
    - KeyError: If the OPENAI_API_KEY environment variable is not set.

    Errors raised while building or querying the SmartDataframe are printed and
    retried up to 5 times, waiting 2 seconds between attempts.
    """
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

    max_retries = 5
    wait_seconds = 2  # Time to wait between retries

    # Copy first so the feature columns never leak into the caller's frame
    data = create_temporal_features(data_to_filter.copy(), "date")

    for attempt in range(1, max_retries + 1):
        try:
            llm = OpenAI(OPENAI_API_KEY)
            df = SmartDataframe(
                data, config={"llm": llm, "llm_model": "gpt-4-1106-preview"}
            )
            return df.chat(prompt).convert_dtypes()
        except Exception as e:
            # Report the failure instead of swallowing it silently
            print(f"prompt_to_filter attempt {attempt}/{max_retries} failed: {e!r}")
            if attempt < max_retries:
                time.sleep(wait_seconds)

    return "The function did not work, please let the user know that we are not able to filter the transactions."

In [74]:
data.head(2)

Unnamed: 0,user,date,amount,trx_category,method,team,receipt,VAT,Status
0,user_3,2020-11-19,7,Restaurants,taxes,Marketing,Not needed,Missing,Executed
1,user_2,2021-11-10,977,Health,direct_debit,Sales,Missing,Filled in,Declined


In [118]:
# prompt_to_filter needs the DataFrame as its second argument; passing a copy
# guarantees that `data` keeps its original columns
test = prompt_to_filter(
    "filter the dataframe to only keep the rows where the amount is greater than 900",
    data.copy(),
)

In [124]:
type(test.convert_dtypes())

pandas.core.frame.DataFrame

In [245]:
def dataframe_insights(prompt, data_to_filter):
    """
    Answers a natural-language question about a transaction DataFrame using a
    LangChain pandas DataFrame agent.

    Temporal feature columns are added to a copy of the input before the agent
    is created, so the caller's DataFrame is not modified.

    Args:
    - prompt (str): A prompt describing the analysis or question for the DataFrame.
    - data_to_filter (DataFrame): The DataFrame to analyze. It must contain a
      "date" column.

    Returns:
    - str: The agent's answer, or a fixed fallback message (meant to be relayed
      by the assistant) when all 5 attempts failed.

    Raises:
    - KeyError: If the OPENAI_API_KEY environment variable is not set.

    Errors raised while building or running the agent are printed and retried
    up to 5 times, waiting 2 seconds between attempts.
    """

    # Fail fast when the key is missing. The value is not passed to ChatOpenAI
    # below -- presumably the library reads it from the environment (TODO confirm).
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
    max_retries = 5
    wait_seconds = 2  # Time to wait between retries

    # Copy first so the feature columns never leak into the caller's frame
    data = create_temporal_features(data_to_filter.copy(), "date")

    for attempt in range(1, max_retries + 1):
        try:
            agent = create_pandas_dataframe_agent(
                ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
                data,
                verbose=True,
                agent_type=AgentType.OPENAI_FUNCTIONS,
                handle_parsing_errors=True,
            )
            return agent.run(prompt)
        except Exception as e:
            # Report the failure instead of swallowing it silently
            print(f"dataframe_insights attempt {attempt}/{max_retries} failed: {e!r}")
            if attempt < max_retries:
                time.sleep(wait_seconds)

    return "The function did not work, please let the user know that we are not able to give an answer to this question."

In [67]:
data.head(2)

Unnamed: 0,user,date,amount,trx_category,method,team,receipt,VAT,Status
0,user_3,2020-11-19,7,Restaurants,taxes,Marketing,Not needed,Missing,Executed
1,user_2,2021-11-10,977,Health,direct_debit,Sales,Missing,Filled in,Declined


In [230]:
# dataframe_insights needs the DataFrame as its second argument; passing a copy
# guarantees that `data` keeps its original columns
aaa = dataframe_insights(
    "What is the average amount, number of transactions and most frequent trx_category for the user 3 ?",
    data.copy(),
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df_user_3 = df[df['user'] == 'user_3']\navg_amount = df_user_3['amount'].mean()\nnum_transactions = df_user_3.shape[0]\nmost_frequent_category = df_user_3['trx_category'].mode()[0]\n(avg_amount, num_transactions, most_frequent_category)"}`


[0m[36;1m[1;3m(497.11009174311926, 327, 'Shopping')[0m[32;1m[1;3mThe average amount for user 3 is 497.11, the number of transactions is 327, and the most frequent trx_category is "Shopping".[0m

[1m> Finished chain.[0m


In [231]:
aaa

'The average amount for user 3 is 497.11, the number of transactions is 327, and the most frequent trx_category is "Shopping".'

In [181]:
data.VAT.value_counts()

Filled in    503
Missing      497
Name: VAT, dtype: int64

# Main LLM

In [81]:
import openai as openai

In [197]:
print(prompts.assistant_instructions)


You are an assistant tasked with analyzing customer transaction data. Your objective is to swiftly and accurately respond to user inquiries regarding their transaction. There are always 2 things to do : share insights regarding the question - either the actual answer or some insights on the transactions displayed - and filter the relevant transactions.
You can do that using two functions:

1. `dataframe_insights`: Use this function to derive insights from the data based on the user's question. You can use this function multiple times to get the number of transactions, the total amount and the most common trx_category.

2. `dataframe_filter`: This function allows you to filter the transaction data according to the user's query. Filtering occurs automatically based on the formulated prompt.

For user queries focusing solely on filtering data, provide a brief summary of the filtered data using dataframe_filter, including the number of transactions, average transaction amount, and the mos

In [270]:
def _prompt_tool(name, description, prompt_description):
    """Builds the function-tool schema of a tool whose only argument is a text prompt."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    "prompt": {
                        "type": "string",
                        "description": prompt_description,
                    },
                },
                "required": ["prompt"],
            },
        },
    }


# Create or load assistant
def create_assistant(client):
    """
    Returns the ID of the assistant, creating the assistant on first use.

    If "assistant.json" exists in the working directory, the ID stored under
    its "assistant_id" key is returned and no API call is made. Otherwise a new
    assistant is created with `client.beta.assistants.create` and its ID is
    written to that file. Delete the file to force a new assistant (for example
    after changing the instructions or the tool definitions).

    Both tools expose a single `prompt` argument. The DataFrame is not part of
    the schema: the `chat` function supplies it locally and only reads
    `arguments["prompt"]` from the model's tool call.

    Args:
    - client: OpenAI client exposing `beta.assistants.create`. Only used when
      no cached ID is found.

    Returns:
    - str: The assistant ID.
    """
    assistant_file_path = "assistant.json"

    # Load existing assistant if file exists
    if os.path.exists(assistant_file_path):
        with open(assistant_file_path, "r") as file:
            assistant_data = json.load(file)
        print("Loaded existing assistant ID.")
        return assistant_data["assistant_id"]

    # Create a new assistant if file does not exist
    assistant = client.beta.assistants.create(
        instructions=prompts.assistant_instructions,
        model="gpt-4-1106-preview",
        tools=[
            _prompt_tool(
                "prompt_to_filter",
                "Applies a given prompt to filter the transaction dataframe from the user.",
                "A prompt describing the filter criteria for the DataFrame.",
            ),
            _prompt_tool(
                "dataframe_insights",
                "Given a prompt, analyzes the transaction dataframe and outputs insights.",
                "A prompt describing the analysis or question for the DataFrame.",
            ),
        ],
    )

    # Save assistant ID to file for future runs
    with open(assistant_file_path, "w") as file:
        json.dump({"assistant_id": assistant.id}, file)
    print("Created a new assistant and saved the ID.")

    return assistant.id

In [271]:
# OpenAI client used by create_assistant, start_conversation and chat
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [272]:
# Reuse the assistant ID stored in assistant.json, or create a new assistant and store its ID
assistant_id = create_assistant(client)

Created a new assistant and saved the ID.


In [273]:
def start_conversation():
    """
    Opens a new thread on the module-level `client`.

    Returns:
    - dict: {"thread_id": <ID of the newly created thread>}
    """
    print("Starting a new conversation...")
    new_thread_id = client.beta.threads.create().id
    print(f"New thread created with ID: {new_thread_id}")
    return dict(thread_id=new_thread_id)

In [238]:
def chat(thread_id, user_input, data_filterered):
    """
    Sends a user message to the assistant thread, serves the assistant's tool
    calls locally and returns the assistant's final answer.

    Args:
    - thread_id (str): ID of the thread to post the message to.
    - user_input (str): The user's message.
    - data_filterered (DataFrame): Transactions handed to `dataframe_insights`
      and `prompt_to_filter` whenever the assistant calls those tools.

    Returns:
    - dict: {"response": <text of the latest thread message>,
             "data_filterered": <result of the last successful prompt_to_filter
             call, or the input DataFrame when no filter was applied>}
    - When `thread_id` is empty: the tuple (jsonify(error payload), 400).

    Raises:
    - RuntimeError: If the run ends as "failed", "cancelled" or "expired".
    """
    if not thread_id:
        print("Error: Missing thread_id")
        # NOTE(review): jsonify presumably needs an active Flask application
        # context, which a plain notebook does not provide -- confirm before
        # relying on this branch outside a Flask app.
        return jsonify({"error": "Missing thread_id"}), 400

    print(f"Received message: {user_input} for thread ID: {thread_id}")

    # Add the user's message to the thread
    client.beta.threads.messages.create(
        thread_id=thread_id, role="user", content=user_input
    )

    # Run the Assistant
    run = client.beta.threads.runs.create(
        thread_id=thread_id, assistant_id=assistant_id
    )

    # Without a filter call the caller gets the unfiltered input back
    # (previously this name was unbound and the final return raised)
    filtered_dataframe = data_filterered
    failed_statuses = {"failed", "cancelled", "expired"}

    # Poll the run until it completes, serving tool calls along the way
    while True:
        run_status = client.beta.threads.runs.retrieve(
            thread_id=thread_id, run_id=run.id
        )
        if run_status.status == "completed":
            break
        if run_status.status in failed_statuses:
            # These states never turn into "completed"; stop instead of polling forever
            raise RuntimeError(
                f"Assistant run {run.id} ended with status '{run_status.status}'"
            )
        if run_status.status == "requires_action":
            tool_outputs = []
            for tool_call in run_status.required_action.submit_tool_outputs.tool_calls:
                name = tool_call.function.name
                try:
                    arguments = json.loads(tool_call.function.arguments)
                    prompt = arguments["prompt"]
                except (json.JSONDecodeError, KeyError, TypeError) as exc:
                    # Report malformed arguments back to the model instead of aborting
                    output = json.dumps(f"Invalid arguments for {name}: {exc!r}")
                else:
                    if name == "dataframe_insights":
                        # Answer an analytical question about the transactions
                        output = json.dumps(dataframe_insights(prompt, data_filterered))
                    elif name == "prompt_to_filter":
                        # Filter the transactions according to the prompt
                        result = prompt_to_filter(prompt, data_filterered)
                        if hasattr(result, "to_json"):
                            filtered_dataframe = result
                            output = json.dumps(result.to_json())
                        else:
                            # prompt_to_filter returns a plain message when it gave up
                            output = json.dumps(str(result))
                    else:
                        output = json.dumps(f"Unknown function: {name}")
                tool_outputs.append({"tool_call_id": tool_call.id, "output": output})

            # Every pending tool call is answered in one request
            client.beta.threads.runs.submit_tool_outputs(
                thread_id=thread_id,
                run_id=run.id,
                tool_outputs=tool_outputs,
            )
        time.sleep(1)  # Wait for a second before checking again, whatever the status

    # Retrieve and return the latest message from the assistant
    messages = client.beta.threads.messages.list(thread_id=thread_id)
    response = messages.data[0].content[0].text.value

    print(f"Assistant response: {response}")
    return {"response": response, "data_filterered": filtered_dataframe}

In [237]:
# Open a new thread; its ID is passed to chat() in the cells below
thread = start_conversation()
thread["thread_id"]

Starting a new conversation...
New thread created with ID: thread_jQt6e2SU17pmGR42WD8m1asW


'thread_jQt6e2SU17pmGR42WD8m1asW'

In [253]:
# Ask a filtering question about user_3's transactions; the `user` column is
# dropped before the frame is handed to the tools
response = chat(
    thread["thread_id"],
    "Show me all transactions above 800 for the marketingt team",
    data_filterered.query("user == 'user_3'").drop(columns=["user"]),
)

Received message: Show me all transactions above 800 for the marketingt team for thread ID: thread_jQt6e2SU17pmGR42WD8m1asW
Assistant response: The Marketing team has made a total of 12 transactions with amounts greater than 800. The details of these transactions are as follows:

- Dates range from 2020-05-03 to 2021-12-25.
- Amounts range from 804 to 997.
- Transaction categories include Fees, Mobile, Shopping, Hotel, Transfer, ATM, and Travel.
- Payment methods used are card payment, transfer, fees, direct debit, and taxes.
- The status of receipts varies between Missing, Not needed, and Attached.
- VAT statuses include both Filled in and Missing.
- The transactions' statuses are Declined, Executed, and Processing.


In [254]:
print(response["response"])

The Marketing team has made a total of 12 transactions with amounts greater than 800. The details of these transactions are as follows:

- Dates range from 2020-05-03 to 2021-12-25.
- Amounts range from 804 to 997.
- Transaction categories include Fees, Mobile, Shopping, Hotel, Transfer, ATM, and Travel.
- Payment methods used are card payment, transfer, fees, direct debit, and taxes.
- The status of receipts varies between Missing, Not needed, and Attached.
- VAT statuses include both Filled in and Missing.
- The transactions' statuses are Declined, Executed, and Processing.


In [255]:
response["data_filterered"]

Unnamed: 0,date,amount,trx_category,method,team,receipt,VAT,Status
93,2021-04-03,921,Fees,card,Marketing,Missing,Filled in,Declined
209,2021-12-25,892,Mobile,transfer,Marketing,Not needed,Missing,Declined
214,2021-01-12,927,Shopping,fees,Marketing,Not needed,Filled in,Declined
265,2021-08-30,997,Hotel,card,Marketing,Attached,Missing,Declined
280,2021-07-05,868,Transfer,card,Marketing,Missing,Missing,Declined
293,2020-05-03,804,Hotel,direct_debit,Marketing,Attached,Filled in,Executed
677,2021-09-07,857,ATM,direct_debit,Marketing,Missing,Missing,Executed
754,2020-11-29,824,Travel,taxes,Marketing,Missing,Missing,Processing
762,2021-06-17,812,ATM,fees,Marketing,Missing,Missing,Processing
777,2021-02-02,813,Travel,fees,Marketing,Attached,Missing,Processing


In [264]:
# Follow-up question on the same thread, again restricted to user_3.
# NOTE: the recorded run of this cell ended with a JSONDecodeError (see the output below).
response = chat(
    thread["thread_id"],
    "SHow me the transactions for the first week of each month",
    data_filterered.query("user == 'user_3'").drop(columns=["user"]),
)

Received message: SHow me the transactions for the first week of each month for thread ID: thread_jQt6e2SU17pmGR42WD8m1asW


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [259]:
response["response"]

'The Marketing team has spent the most, with a total expenditure of 109,313. The transactions for the Marketing team are now filtered for further insights.'

In [261]:
response["data_filterered"]

Unnamed: 0,date,amount,trx_category,method,team,receipt,VAT,Status
0,2020-11-19,7,Restaurants,taxes,Marketing,Not needed,Missing,Executed
27,2020-10-16,293,Train,card,Marketing,Missing,Filled in,Declined
81,2021-09-01,726,Finance,fees,Marketing,Not needed,Missing,Declined
93,2021-04-03,921,Fees,card,Marketing,Missing,Filled in,Declined
105,2020-02-21,576,Shopping,direct_debit,Marketing,Missing,Missing,Processing
...,...,...,...,...,...,...,...,...
911,2021-04-07,805,Fees,taxes,Marketing,Missing,Missing,Processing
919,2020-02-24,718,Sport,taxes,Marketing,Not needed,Filled in,Declined
932,2021-01-03,486,Food & Grocery,transfer,Marketing,Missing,Filled in,Executed
956,2021-06-23,1,Travel,direct_debit,Marketing,Attached,Missing,Declined


In [263]:
# Totals per team. numeric_only=True states explicitly that only numeric
# columns are summed, instead of relying on pandas to drop the text columns
# (behaviour that differs between pandas versions).
data_filterered.groupby("team").sum(numeric_only=True)

  data_filterered.groupby("team").sum()


Unnamed: 0_level_0,amount
team,Unnamed: 1_level_1
Finance,93940
HR,93926
IT,103929
Marketing,109313
Sales,94907
