In [2]:
from dotenv import load_dotenv
import os
# Load settings via python-dotenv before the API key is read from the
# environment in the next cell (presumably from a local .env file — confirm).
load_dotenv()


True

In [3]:
# API key used for every OpenAI request in this notebook; None when the
# environment variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Schema Extraction

In [4]:
import pandas as pd
import json

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# Load the jobs dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('jobs_in_data.csv')

In [6]:
def extract_metadata(df: pd.DataFrame, col_limit=150) -> str:
    """Summarise a DataFrame's schema and basic statistics as a JSON string.

    Args:
        df: Frame to describe. It is only read, never modified, so no
            defensive copy is taken.
        col_limit: Maximum number of columns accepted.

    Returns:
        JSON text with the keys ``column_names``, ``data_dimensions``
        (serialised as a ``[rows, columns]`` list), ``data_types_per_column``,
        ``statistical_summary`` (the ``df.describe()`` table, or ``{}`` for a
        frame without columns) and ``missing_values_per_column``.

    Raises:
        ValueError: If ``df`` has more than ``col_limit`` columns.
    """
    n_columns = df.shape[1]
    if n_columns > col_limit:
        raise ValueError(f"Dataframe exceeds col limit of {col_limit}")

    # describe() raises on a frame that has no columns at all, so report an
    # empty summary for that case instead of failing.
    statistical_summary = df.describe().to_dict() if n_columns else {}

    metadata_dict = {
        'column_names': df.columns.tolist(),
        'data_dimensions': df.shape,
        'data_types_per_column': {
            column: dtype.name for column, dtype in df.dtypes.items()
        },
        'statistical_summary': statistical_summary,
        'missing_values_per_column': df.isnull().sum().to_dict(),
    }
    return json.dumps(metadata_dict)

In [7]:
# Serialise the dataset's schema and statistics once; this JSON string is
# injected into the system prompt further down.
metadata_json = extract_metadata(df)

## Code Generation LLM

In [8]:
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts.chat import ChatPromptTemplate



In [9]:
# Schema for the structured LLM reply: the generated plotting code plus a short
# explanation of it. The Field descriptions below end up in the format
# instructions that the parser produces for the prompt.
class CodeAndDescription(BaseModel):
    code: str = Field(description="code to generate the plots")
    description: str = Field(description="brief description of the code and plots")

    # Disabled validator, kept for reference.
    # NOTE(review): if re-enabled it would reject any `code` value that does not
    # start with a ```python fence — confirm the model is expected to return
    # fenced code before turning it back on.
    # @validator("code")
    # def code_must_be_python(cls, v):
    #     if not v.startswith("```python"):
    #         raise ValueError("code must be python")
    #     return v
    

# Parser bound to the schema above; it supplies the format instructions for the
# prompt and is later used to parse the model's reply.
parser = PydanticOutputParser(pydantic_object=CodeAndDescription)

In [11]:
# Show the format instructions that are appended to the human message.
print(parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"code": {"title": "Code", "description": "code to generate the plots", "type": "string"}, "description": {"title": "Description", "description": "brief description of the code and plots", "type": "string"}}, "required": ["code", "description"]}
```


In [9]:

# Two system messages: the assistant's role, then the dataset metadata.
system_template_1 = "You are a coding assistant designed to create code to plot graphs in python in plotly. You should return a brief description and the code to plot the graph"
system_template_2 = "Here is some metadata about the data: {metadata_json}"
# Use a real newline ("\n", not "/n") so the format instructions are separated
# from the user's request instead of being preceded by a literal "/n".
human_message_template = "{human_message} \n {format_instructions}"

chat_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_template_1),
        ("system", system_template_2),
        ("human", human_message_template),
    ]
)

# Fill the template placeholders: dataset metadata, the plotting request, and
# the parser's schema instructions.
messages = chat_prompt.format_messages(
    metadata_json=metadata_json,
    human_message="Make a plot of the average salary of every job.",
    format_instructions=parser.get_format_instructions(),
)

In [10]:
# Chat model used for code generation.
# NOTE(review): temperature=1 is a fairly high sampling temperature for
# generating code — confirm this is intended.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", api_key=OPENAI_API_KEY, temperature=1)

In [11]:
# Send the formatted prompt to the model; the reply text is read from
# `response.content` afterwards.
response = llm.invoke(messages)

In [12]:
# Parse the raw reply text with the parser built for CodeAndDescription.
formatted_response = parser.parse(response.content)


In [13]:
# `formatted_response` is already produced by the preceding cell's
# `parser.parse(response.content)`, so the reply is not parsed a second time here.
print(formatted_response.description)
print('-' * 50)  # visual separator between the description and the code
print(formatted_response.code)


This code calculates the average salary for each job category and creates a bar plot using Plotly Express. The x-axis represents the job categories, the y-axis represents the average salary, and each bar is color-coded by job category. The plot shows the average salary for each job category.
--------------------------------------------------
import plotly.express as px

# Group the data by job_category and calculate the average salary
avg_salary_by_job = data.groupby('job_category')['salary'].mean().reset_index()

# Use plotly express to create a bar plot
fig = px.bar(avg_salary_by_job, x='job_category', y='salary', color='job_category', title='Average Salary by Job Category')

# Display the plot
def plot_average_salary_by_job():
    fig.show()

plot_average_salary_by_job()


## Code Execution and Visualisation


In [15]:
import plotly.express as px

# Mean salary for each distinct job title, as a two-column frame
# (job_title, salary).
average_salary = df.groupby('job_title', as_index=False)['salary'].mean()

# Bar chart of the per-title averages.
fig = px.bar(
    average_salary,
    x='job_title',
    y='salary',
    title='Average Salary by Job Title',
)
# Enlarge all text in the figure.
fig.update_layout(font={'size': 40})
fig.show()
# Export a static copy to disk; the validation cell below reads this file back.
fig.write_image("average_salary.jpg")

In [16]:
import base64
import requests

# Helper that turns an image file into base64 text
def encode_image(image_path):
  """Return the contents of the file at ``image_path`` as a base64 string.

  The file is read in binary mode and the base64 bytes are decoded as UTF-8,
  so the result can be embedded directly in a text payload.
  """
  with open(image_path, "rb") as image_file:
    raw_bytes = image_file.read()
  encoded_bytes = base64.b64encode(raw_bytes)
  return encoded_bytes.decode('utf-8')

# Path to the image exported by the plotting cell
image_path = "average_salary.jpg"

# Base64 text of the image, embedded in the request as a data URL
base64_image = encode_image(image_path)

headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {OPENAI_API_KEY}"
}

payload = {
  "model": "gpt-4-vision-preview",
  "messages": [
    {
      "role": "system",
      "content": """You are a highly skilled validation agent who will be examining plots generated with Python's plotly library.
      You should validate whether the plot is legible and whether anything is majorly wrong with it. Examples of problems include:
      - The plot is not legible
      - Axis labels are hard to read
      - Something is obviously wrong with the plot
      If you find anything wrong with the plot, please provide feedback on how to fix it in plain English. If the plot is sufficiently legible, please return "The plot is legible".
      """
    },
    # TODO: insert the code description in here?
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "Please validate this plot I made using the plotly library"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": f"data:image/jpeg;base64,{base64_image}"
          }
        }
      ]
    }
  ],
  # NOTE(review): the recorded run used exactly 300 completion tokens, so the
  # feedback may be getting cut off at this cap — confirm and raise if needed.
  "max_tokens": 300
}

# Without a timeout, requests waits indefinitely on a stalled connection and
# the cell would hang; the tuple is (connect, read) limits in seconds.
response = requests.post(
  "https://api.openai.com/v1/chat/completions",
  headers=headers,
  json=payload,
  timeout=(10, 120),
)

print(response.json())

{'id': 'chatcmpl-8mqUK67W6UZdzR2PpZhp8pimXqpZg', 'object': 'chat.completion', 'created': 1706652200, 'model': 'gpt-4-1106-vision-preview', 'usage': {'prompt_tokens': 443, 'completion_tokens': 300, 'total_tokens': 743}, 'choices': [{'message': {'role': 'assistant', 'content': 'To make the plot clearer, consider the following suggestions:\n\n1. Axis Labels: Make sure that both the y-axis and x-axis have clear, legible labels that describe what they represent. For example, the y-axis could be labeled "Average Salary (USD)" if that is appropriate, and the x-axis could be labeled "Job Title".\n\n2. Scale and Ticks: Adjust the scale of the y-axis so that it fits the range of the data more closely, which could help in differentiating the salaries among different job titles. Also, ensure that the tick marks on the axes are appropriately spaced and labeled for easy reading.\n\n3. Increase Text Size: Increase the size of the text for the job titles so they are more readable. Avoid overlapping te

In [19]:
# Display the decoded JSON body of the validation response as the cell output.
response.json()

{'id': 'chatcmpl-8mqUK67W6UZdzR2PpZhp8pimXqpZg',
 'object': 'chat.completion',
 'created': 1706652200,
 'model': 'gpt-4-1106-vision-preview',
 'usage': {'prompt_tokens': 443,
  'completion_tokens': 300,
  'total_tokens': 743},
 'choices': [{'message': {'role': 'assistant',
    'content': 'To make the plot clearer, consider the following suggestions:\n\n1. Axis Labels: Make sure that both the y-axis and x-axis have clear, legible labels that describe what they represent. For example, the y-axis could be labeled "Average Salary (USD)" if that is appropriate, and the x-axis could be labeled "Job Title".\n\n2. Scale and Ticks: Adjust the scale of the y-axis so that it fits the range of the data more closely, which could help in differentiating the salaries among different job titles. Also, ensure that the tick marks on the axes are appropriately spaced and labeled for easy reading.\n\n3. Increase Text Size: Increase the size of the text for the job titles so they are more readable. Avoid o