In [0]:
%pip install bs4 databricks-agents langchain-text-splitters --quiet
%restart_python

## A Chunky View on Les Miserables

In [0]:
from langchain_text_splitters import HTMLHeaderTextSplitter
import requests

# Download the Project Gutenberg HTML edition of Les Miserables.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently chunking an HTTP error page.
response = requests.get(
    'https://www.gutenberg.org/cache/epub/135/pg135-images.html',
    timeout=60,
)
response.raise_for_status()
miserables_text = response.text

# (HTML tag, metadata key) pairs: the text of each matching header is stored
# under the given key in the metadata of the chunks produced below.
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

# Split the document into chunks at every h1/h2/h3 boundary.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text(miserables_text)

This gives us 369 chunks, 365 of which are chapters.

In [0]:
import matplotlib.pyplot as plt
import numpy as np

# Keep only chunks with more than 1000 characters of content, then drop the
# first survivor (presumably front matter rather than a chapter -- TODO confirm).
valid_chunks = list(
    filter(lambda chunk: len(chunk.page_content) > 1000, html_header_splits)
)[1:]
valid_chunk_lengths = list(map(lambda chunk: len(chunk.page_content), valid_chunks))

# Distribution of chunk sizes, measured in characters.
plt.style.use('ggplot')
plt.hist(valid_chunk_lengths, bins=20, edgecolor='black')
plt.gca().set(
    title='Histogram of Valid Chunk Lengths',
    xlabel='Length of Valid Chunks',
    ylabel='Frequency',
)
plt.show()

In [0]:
import pandas as pd

def extract_passage(passage):
  """Flatten one split chunk into a plain dict.

  Reads the 'Header 2' and 'Header 3' entries from ``passage.metadata``
  (each defaulting to an empty string when absent) together with
  ``passage.page_content``.

  Returns:
    dict with the keys 'header_2', 'header_3' and 'page_content'.
  """
  meta = passage.metadata
  record = {}
  record["header_2"] = meta.get('Header 2', "")
  record["header_3"] = meta.get('Header 3', "")
  record["page_content"] = passage.page_content
  return record
  
# Turn every retained chunk into a flat record and tabulate them, one row per chunk.
extracted_passages = list(map(extract_passage, valid_chunks))
les_mis_df = pd.DataFrame(data=extracted_passages)
les_mis_df

In [0]:
import os

from databricks.sdk import WorkspaceClient

# A single client is enough. `w` is kept as an alias of the same object so any
# cell that refers to either name keeps working, without authenticating twice.
workspace_client = WorkspaceClient()
w = workspace_client
workspace_url = workspace_client.config.host

# Inside a Databricks runtime, take the API token from the notebook context;
# anywhere else, fall back to the token resolved by the SDK configuration.
if "DATABRICKS_RUNTIME_VERSION" in os.environ:
    token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
else:
    token = workspace_client.config.token

# Fail here with a clear message instead of passing an empty API key on to the
# OpenAI client in a later cell.
if not token:
    raise RuntimeError(
        "No Databricks API token could be resolved; configure token-based "
        "authentication before querying the serving endpoints."
    )

In [0]:
# Pick a single retained chunk to try the extraction prompt on in the next cell.
passage = valid_chunks[3]

In [0]:
from openai import OpenAI

# OpenAI-compatible client pointed at this workspace's serving endpoints.
client = OpenAI(
    api_key=token,
    base_url=f"{workspace_url}/serving-endpoints",
)

# Query AI Gateway
# Header lookups use .get(..., '') -- the same defaulting as extract_passage --
# so a chunk that lacks a header level does not raise KeyError.
response = client.chat.completions.create(
    model='azure-gpt-4o-mini',
    messages=[
        {"role": "user", "content": f"""
         Take this passage from Les Miserables and do structured data extraction in JSON. I want you to provide the title of the chapter, a list of characters, a synopsis of the chapter, and the overall sentiment of the chapter - positive, neutral, or negative. Do not make up anything if the passage isn't part of the novel.
         
         {passage.metadata.get('Header 2', '')}
         {passage.metadata.get('Header 3', '')}
         {passage.page_content}
         """}
    ],
    extra_headers={"client_request_id":'test'}
)

# Extract json response: strip the Markdown code fence (and its "json" language
# tag) that the model may wrap around the JSON payload.
response.choices[0].message.content.replace("json\n","").replace("```","")

## Initialize Ray Cluster

In [0]:
from ray.util.spark import setup_ray_cluster
import ray

# Start a fixed-size Ray-on-Spark cluster: min and max are both 3 worker nodes.
setup_ray_cluster(max_worker_nodes=3, min_worker_nodes=3)

# Initialise Ray in this notebook; ignore_reinit_error=True asks ray.init not to
# raise if Ray has already been initialised (e.g. when the cell is re-run).
ray.init(ignore_reinit_error=True)

In [0]:
# One plain dict per DataFrame row -- the input unit for each Ray task below.
data_list = les_mis_df.to_dict("records")
data_list[1]

In [0]:
import time

@ray.remote(num_cpus=1)
def extract_data_from_passage_ray(data_dict):
    """Run the structured-extraction prompt on one passage as a Ray task.

    Args:
        data_dict: Record with the keys 'header_2', 'header_3' and
            'page_content'.

    Returns:
        The same dict with 'extracted_data' added. On success it holds the
        model reply with the Markdown code fence stripped, followed by
        "time: <elapsed seconds>". On failure it holds the sentinel "ERROR",
        and an additional 'error' key carries the repr of the exception so
        the cause is not lost.
    """
    try:
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by wall-clock adjustments.
        start_time = time.perf_counter()

        # Built inside the task so each worker creates its own client.
        client = OpenAI(
            api_key=token,
            base_url=f"{workspace_url}/serving-endpoints",
        )

        response = client.chat.completions.create(
            model='azure-gpt4o',
            messages=[
                {"role": "user", "content": f"""
                Take this passage from Les Miserables and do structured data extraction in JSON. I want you to provide the title of the chapter, a list of characters, a synopsis of the chapter, and the overall sentiment of the chapter - positive, neutral, or negative. Do not make up anything if the passage isn't part of the novel. Also include 'experiment: gpt4o-ray'

                Output Format:
                    title: 
                    characters: []
                    synopsis:
                    sentiment:
                    experiment:

                {data_dict['header_2']}
                {data_dict['header_3']}
                {data_dict['page_content']}
                """}
            ],
        )

        elapsed_time = time.perf_counter() - start_time

        # Strip the Markdown code fence and its "json" language tag. The
        # previous .replace("", "") was a no-op and left the fence in place.
        content = response.choices[0].message.content
        cleaned = content.replace("json\n", "").replace("```", "")
        data_dict['extracted_data'] = cleaned + f"time: {elapsed_time:.2f}"
    except Exception as exc:
        # Best-effort: one failing passage must not abort the whole batch.
        # Keep the "ERROR" sentinel and record why it failed. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        data_dict['extracted_data'] = "ERROR"
        data_dict['error'] = repr(exc)

    return data_dict

In [0]:
# Submit one Ray task per passage record, then block until all have finished.
futures = list(map(extract_data_from_passage_ray.remote, data_list))
results = ray.get(futures)

In [0]:
# Gather the per-passage result dicts into a DataFrame and spot-check one extraction.
pd_df = pd.DataFrame(data=results)
pd_df["extracted_data"].iloc[3]

In [0]:
# Save as files since we've hijacked our spark clusters
# (the results are written with pandas directly to the /Volumes path below).
# NOTE(review): the file name says "o1", but the Ray task in this notebook calls
# the 'azure-gpt4o' endpoint and tags its output 'gpt4o-ray' -- confirm the name.
pd_df.to_csv("/Volumes/shm/default/llm_profiling/les_mis_df-azure-o1-ray.csv")